diff -Nru dm-writeboost-2.1.1/ChangeLog dm-writeboost-2.2.6/ChangeLog --- dm-writeboost-2.1.1/ChangeLog 2016-01-04 13:25:39.000000000 +0000 +++ dm-writeboost-2.2.6/ChangeLog 2016-09-19 06:15:04.000000000 +0000 @@ -1,3 +1,60 @@ +2016-09-19 Akira Hayakawa + + * v2.2.6 + * Clarify producer-consumer pattern + * Fix build error with 3.10 kernel + * Fix build error with 3.14 kernel + +2016-09-12 Akira Hayakawa + + * v2.2.5 + * Fix read-caching data corruption issue + * Insert memory barriers + * Code cleanup + +2016-08-28 Akira Hayakawa + + * v2.2.4 + * Fix update_sb_record_interval + * Throttle writeback when there are only few empty segments in the + caching device + * Remove experimental from read-caching + +2016-08-02 Akira Hayakawa + + * v2.2.3 + * Rename write_through_mode to write_around_mode because it's more + precise + * Reformat the caching device when it's write_around_mode + +2016-07-30 Akira Hayakawa + + * v2.2.2 + * Use kmap_atomic() to access the bio payload + * Fix doc (clear_stat) + +2016-07-18 Akira Hayakawa + + * v2.2.1 + * Unsupport TRIM + * Fixes (fail if partial read from caching device fails etc.) + +2016-05-01 Akira Hayakawa + + * v2.2.0 + * Remove partial writeback in foreground. This results in writing + back cached data strictly from the older ones, which makes cache + device corruption safer + * Fix build error for kernel 4.6. per_bio_data_size is renamed to + per_io_data_size + * Remove SECTOR_SHIFT + +2016-03-05 Akira Hayakawa + + * v2.1.2 + * Remove blockup mechanism + * Use vmalloc for read_cache_cell's buffer + 2016-01-04 Akira Hayakawa * v2.1.1 diff -Nru dm-writeboost-2.1.1/debian/changelog dm-writeboost-2.2.6/debian/changelog --- dm-writeboost-2.1.1/debian/changelog 2017-02-06 13:55:59.000000000 +0000 +++ dm-writeboost-2.2.6/debian/changelog 2017-07-14 11:07:17.000000000 +0000 @@ -1,8 +1,55 @@ -dm-writeboost (2.1.1-1ubuntu1) xenial; urgency=medium +dm-writeboost (2.2.6-1~16.04.1) xenial; urgency=low - * Add kernel 4.6/4.8 compat code (LP: #1662107) + * Backport to xenial to support linux-hwe 4.10 kernels. + (LP: #1704280) - -- Stefan Bader Mon, 06 Feb 2017 11:39:32 +0100 + -- Andy Whitcroft Fri, 14 Jul 2017 12:07:17 +0100 + +dm-writeboost (2.2.6-1) unstable; urgency=medium + + * New upstream release [September 2016]. + + fixed FTBFS with Linux 4.8 (Closes: #838547). + + -- Dmitry Smirnov Thu, 22 Sep 2016 21:26:58 +1000 + +dm-writeboost (2.2.5-1) unstable; urgency=medium + + * New upstream release [September 2016]. + + -- Dmitry Smirnov Tue, 13 Sep 2016 07:59:37 +1000 + +dm-writeboost (2.2.4-1) unstable; urgency=medium + + * New upstream release [August 2016]. + + -- Dmitry Smirnov Sun, 28 Aug 2016 21:02:43 +1000 + +dm-writeboost (2.2.3-1) unstable; urgency=medium + + * New upstream release [August 2016]. + + -- Dmitry Smirnov Tue, 02 Aug 2016 22:41:47 +1000 + +dm-writeboost (2.2.1-1) unstable; urgency=medium + + * New upstream release [July 2016]. + * Vcs-Git URL to HTTPS. + + -- Dmitry Smirnov Mon, 18 Jul 2016 19:31:34 +1000 + +dm-writeboost (2.2.0-1) unstable; urgency=medium + + * New upstream release [May 2016]. + * Standards-Version: 3.9.8. + + -- Dmitry Smirnov Sun, 01 May 2016 21:26:05 +1000 + +dm-writeboost (2.1.2-1) unstable; urgency=medium + + * New upstream release [March 2016]. + * Standards-Version: 3.9.7. 
+ + -- Dmitry Smirnov Sun, 06 Mar 2016 00:16:26 +1100 dm-writeboost (2.1.1-1) unstable; urgency=medium diff -Nru dm-writeboost-2.1.1/debian/control dm-writeboost-2.2.6/debian/control --- dm-writeboost-2.1.1/debian/control 2015-07-08 17:06:56.000000000 +0000 +++ dm-writeboost-2.2.6/debian/control 2017-07-14 11:07:17.000000000 +0000 @@ -1,12 +1,13 @@ Source: dm-writeboost Section: kernel Priority: optional -Maintainer: Dmitry Smirnov +Maintainer: Ubuntu Developers +XSBC-Original-Maintainer: Dmitry Smirnov Build-Depends: debhelper (>= 9), dkms -Standards-Version: 3.9.6 +Standards-Version: 3.9.8 Homepage: https://github.com/akiradeveloper/dm-writeboost -Vcs-Browser: http://anonscm.debian.org/cgit/collab-maint/dm-writeboost.git -Vcs-Git: git://anonscm.debian.org/collab-maint/dm-writeboost.git +Vcs-Browser: https://anonscm.debian.org/cgit/collab-maint/dm-writeboost.git +Vcs-Git: https://anonscm.debian.org/git/collab-maint/dm-writeboost.git Package: dm-writeboost-dkms Architecture: all diff -Nru dm-writeboost-2.1.1/debian/patches/add-compat-4.6.patch dm-writeboost-2.2.6/debian/patches/add-compat-4.6.patch --- dm-writeboost-2.1.1/debian/patches/add-compat-4.6.patch 2017-02-06 13:54:32.000000000 +0000 +++ dm-writeboost-2.2.6/debian/patches/add-compat-4.6.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -Description: Add compat code for kernel 4.6+ - Rename per_bio_data_size -> per_io_data_size -Forwarded: yes - -Index: dm-writeboost-2.1.1/src/dm-writeboost-target.c -=================================================================== ---- dm-writeboost-2.1.1.orig/src/dm-writeboost-target.c -+++ dm-writeboost-2.1.1/src/dm-writeboost-target.c -@@ -881,6 +881,11 @@ enum PBD_FLAG { - PBD_READ_SEG = 2, - }; - -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,6,0) -+#define PER_BIO_DATA_SIZE per_io_data_size -+#else -+#define PER_BIO_DATA_SIZE per_bio_data_size -+#endif - struct per_bio_data { - enum PBD_FLAG type; - union { -@@ -888,7 +893,7 @@ struct per_bio_data { - struct segment_header *seg; - }; - }; --#define per_bio_data(wb, bio) ((struct per_bio_data *)dm_per_bio_data((bio), (wb)->ti->per_bio_data_size)) -+#define per_bio_data(wb, bio) ((struct per_bio_data *)dm_per_bio_data((bio), (wb)->ti->PER_BIO_DATA_SIZE)) - - static void reserve_read_cache_cell(struct wb_device *, struct bio *); - static int process_read(struct wb_device *wb, struct bio *bio) -@@ -1461,7 +1466,7 @@ static int init_core_struct(struct dm_ta - ti->num_flush_bios = 1; - ti->num_discard_bios = 1; - ti->discard_zeroes_data_unsupported = true; -- ti->per_bio_data_size = sizeof(struct per_bio_data); -+ ti->PER_BIO_DATA_SIZE = sizeof(struct per_bio_data); - - wb = kzalloc(sizeof(*wb), GFP_KERNEL); - if (!wb) { diff -Nru dm-writeboost-2.1.1/debian/patches/add-compat-4.8.patch dm-writeboost-2.2.6/debian/patches/add-compat-4.8.patch --- dm-writeboost-2.1.1/debian/patches/add-compat-4.8.patch 2017-02-06 13:54:41.000000000 +0000 +++ dm-writeboost-2.2.6/debian/patches/add-compat-4.8.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,209 +0,0 @@ -Description: Fix FTBS of DKMS package with 4.8+ kernels - Add compat code to handle rename/split of bi_rw. 
-Author: Stefan Bader -Index: dm-writeboost-2.1.1/src/dm-writeboost-daemon.c -=================================================================== ---- dm-writeboost-2.1.1.orig/src/dm-writeboost-daemon.c -+++ dm-writeboost-2.1.1/src/dm-writeboost-daemon.c -@@ -85,7 +85,11 @@ void flush_proc(struct work_struct *work - - struct dm_io_request io_req = { - .client = wb->io_client, -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) - .bi_rw = WRITE, -+#else -+ .bi_op = REQ_OP_WRITE, -+#endif - .notify.fn = NULL, - .mem.type = DM_IO_VMA, - .mem.ptr.addr = rambuf->data, -@@ -142,7 +146,11 @@ static void submit_writeback_io(struct w - if (writeback_io->data_bits == 255) { - struct dm_io_request io_req_w = { - .client = wb->io_client, -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) - .bi_rw = WRITE, -+#else -+ .bi_op = REQ_OP_WRITE, -+#endif - .notify.fn = writeback_endio, - .notify.context = wb, - .mem.type = DM_IO_VMA, -@@ -168,7 +176,11 @@ static void submit_writeback_io(struct w - - io_req_w = (struct dm_io_request) { - .client = wb->io_client, -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) - .bi_rw = WRITE, -+#else -+ .bi_op = REQ_OP_WRITE, -+#endif - .notify.fn = writeback_endio, - .notify.context = wb, - .mem.type = DM_IO_VMA, -@@ -267,7 +279,11 @@ static void prepare_writeback_ios(struct - - struct dm_io_request io_req_r = { - .client = wb->io_client, -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) - .bi_rw = READ, -+#else -+ .bi_op = REQ_OP_READ, -+#endif - .notify.fn = NULL, - .mem.type = DM_IO_VMA, - .mem.ptr.addr = writeback_seg->buf, -@@ -467,7 +483,12 @@ static void update_superblock_record(str - - io_req = (struct dm_io_request) { - .client = wb->io_client, -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) - .bi_rw = WRITE_FUA, -+#else -+ .bi_op = REQ_OP_WRITE, -+ .bi_op_flags = WRITE_FUA, -+#endif - .notify.fn = NULL, - .mem.type = DM_IO_KMEM, - .mem.ptr.addr = buf, -Index: dm-writeboost-2.1.1/src/dm-writeboost-metadata.c -=================================================================== ---- dm-writeboost-2.1.1.orig/src/dm-writeboost-metadata.c -+++ dm-writeboost-2.1.1/src/dm-writeboost-metadata.c -@@ -321,7 +321,11 @@ static int read_superblock_header(struct - - io_req_sup = (struct dm_io_request) { - .client = wb->io_client, -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) - .bi_rw = READ, -+#else -+ .bi_op = REQ_OP_READ, -+#endif - .notify.fn = NULL, - .mem.type = DM_IO_KMEM, - .mem.ptr.addr = buf, -@@ -385,7 +389,12 @@ static int format_superblock_header(stru - - io_req_sup = (struct dm_io_request) { - .client = wb->io_client, -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) - .bi_rw = WRITE_FUA, -+#else -+ .bi_op = REQ_OP_WRITE, -+ .bi_op_flags = WRITE_FUA, -+#endif - .notify.fn = NULL, - .mem.type = DM_IO_KMEM, - .mem.ptr.addr = buf, -@@ -479,7 +488,11 @@ static int format_all_segment_headers(st - for (i = 0; i < wb->nr_segments; i++) { - struct dm_io_request io_req_seg = { - .client = wb->io_client, -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) - .bi_rw = WRITE, -+#else -+ .bi_op = REQ_OP_WRITE, -+#endif - .notify.fn = format_segmd_endio, - .notify.context = &context, - .mem.type = DM_IO_KMEM, -@@ -653,7 +666,11 @@ static int read_superblock_record(struct - - io_req = (struct dm_io_request) { - .client = wb->io_client, -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) - .bi_rw = READ, -+#else -+ .bi_op = REQ_OP_READ, -+#endif - .notify.fn = NULL, - .mem.type = DM_IO_KMEM, - .mem.ptr.addr = buf, -@@ -682,7 +699,11 @@ static int read_whole_segment(void *buf, - { - struct 
dm_io_request io_req = { - .client = wb->io_client, -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) - .bi_rw = READ, -+#else -+ .bi_op = REQ_OP_READ, -+#endif - .notify.fn = NULL, - .mem.type = DM_IO_VMA, - .mem.ptr.addr = buf, -@@ -778,7 +799,11 @@ static int read_segment_header(void *buf - { - struct dm_io_request io_req = { - .client = wb->io_client, -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) - .bi_rw = READ, -+#else -+ .bi_op = REQ_OP_READ, -+#endif - .notify.fn = NULL, - .mem.type = DM_IO_KMEM, - .mem.ptr.addr = buf, -Index: dm-writeboost-2.1.1/src/dm-writeboost-target.c -=================================================================== ---- dm-writeboost-2.1.1.orig/src/dm-writeboost-target.c -+++ dm-writeboost-2.1.1/src/dm-writeboost-target.c -@@ -106,9 +106,16 @@ int wb_io_internal(struct wb_device *wb, - eb = *err_bits; - - format_dev_t(buf, dev); -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) - DMERR("%s() I/O error(%d), bits(%lu), dev(%s), sector(%llu), rw(%d)", - caller, err, eb, - buf, (unsigned long long) regions->sector, io_req->bi_rw); -+#else -+ DMERR("%s() I/O error(%d), bits(%lu), dev(%s), sector(%llu), op(%d), op_flags(%d)", -+ caller, err, eb, -+ buf, (unsigned long long) regions->sector, -+ io_req->bi_op, io_req->bi_op_flags); -+#endif - } - - return err; -@@ -553,7 +560,11 @@ static void writeback_buffered_mb(struct - memcpy(buf, src, 1 << SECTOR_SHIFT); - io_req = (struct dm_io_request) { - .client = wb->io_client, -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) - .bi_rw = WRITE, -+#else -+ .bi_op = REQ_OP_WRITE, -+#endif - .notify.fn = NULL, - .mem.type = DM_IO_KMEM, - .mem.ptr.addr = buf, -@@ -802,7 +813,11 @@ static int do_process_write(struct wb_de - * bio with REQ_FUA has data. - * For such bio, we first treat it like a normal bio and then as a REQ_FLUSH bio. - */ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) - if (bio->bi_rw & REQ_FUA) { -+#else -+ if (bio->bi_opf & REQ_FUA) { -+#endif - queue_barrier_io(wb, bio); - return DM_MAPIO_SUBMITTED; - } -@@ -963,10 +978,18 @@ static int writeboost_map(struct dm_targ - struct per_bio_data *pbd = per_bio_data(wb, bio); - pbd->type = PBD_NONE; - -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) - if (bio->bi_rw & REQ_DISCARD) -+#else -+ if (bio_op(bio) == REQ_OP_DISCARD) -+#endif - return process_discard_bio(wb, bio); - -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) - if (bio->bi_rw & REQ_FLUSH) -+#else -+ if (bio->bi_opf & REQ_PREFLUSH) -+#endif - return process_flush_bio(wb, bio); - - return process_bio(wb, bio); diff -Nru dm-writeboost-2.1.1/debian/patches/series dm-writeboost-2.2.6/debian/patches/series --- dm-writeboost-2.1.1/debian/patches/series 2017-02-06 13:51:17.000000000 +0000 +++ dm-writeboost-2.2.6/debian/patches/series 1970-01-01 00:00:00.000000000 +0000 @@ -1,2 +0,0 @@ -add-compat-4.6.patch -add-compat-4.8.patch diff -Nru dm-writeboost-2.1.1/doc/dm-writeboost-readme.txt dm-writeboost-2.2.6/doc/dm-writeboost-readme.txt --- dm-writeboost-2.1.1/doc/dm-writeboost-readme.txt 2016-01-04 13:25:39.000000000 +0000 +++ dm-writeboost-2.2.6/doc/dm-writeboost-readme.txt 2016-09-19 06:15:04.000000000 +0000 @@ -1,21 +1,20 @@ dm-writeboost ============= dm-writeboost target provides block-level log-structured caching. -All cache data, writes and reads, are written to the cache device in sequential -manner. +All writes and reads are written to the caching device in sequential manner. 
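The two Debian compat patches removed above (per_bio_data_size -> per_io_data_size for 4.6, and the bi_rw -> bi_op/bi_op_flags split for 4.8) are no longer needed because upstream 2.2.x carries the same version gating itself; see the WB_IO_READ/WB_IO_WRITE/WB_IO_WRITE_FUA and req_is_write() macros added to dm-writeboost.h later in this diff. Below is a minimal userspace sketch of that compile-time pattern only; the struct layout and the numeric op values are stand-ins, not the kernel's definitions.

    #include <stdio.h>

    /* Stand-ins for the kernel's version macros so this builds in userspace. */
    #define KERNEL_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + (c))
    #ifndef LINUX_VERSION_CODE
    #define LINUX_VERSION_CODE KERNEL_VERSION(4, 8, 0)  /* pretend: building on 4.8 */
    #endif

    /* Illustrative request shape; field names mimic the bio API split,
     * but the values used below are made up for the demo. */
    struct io_request {
    #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0)
        int op;        /* 4.8+: the operation (read/write) ... */
        int op_flags;  /* ... and its modifier flags (e.g. FUA) are separate */
    #else
        int rw;        /* older kernels: one combined bi_rw-style field */
    #endif
    };

    #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0)
    #define WB_IO_READ       .op = 0, .op_flags = 0
    #define WB_IO_WRITE      .op = 1, .op_flags = 0
    #define WB_IO_WRITE_FUA  .op = 1, .op_flags = 1
    #define req_is_write(r)  ((r)->op == 1)
    #else
    #define WB_IO_READ       .rw = 0
    #define WB_IO_WRITE      .rw = 1
    #define WB_IO_WRITE_FUA  .rw = 3
    #define req_is_write(r)  ((r)->rw & 1)
    #endif

    int main(void)
    {
        struct io_request flush_log   = { WB_IO_WRITE_FUA };
        struct io_request read_header = { WB_IO_READ };

        printf("flush_log is a %s, read_header is a %s\n",
               req_is_write(&flush_log) ? "write" : "read",
               req_is_write(&read_header) ? "write" : "read");
        return 0;
    }

With the field selection hidden behind the designated-initializer macros, the request-building code (for example the WB_IO_WRITE_FUA initializer used for the superblock record) compiles unchanged on either side of the 4.8 boundary, which is why the out-of-tree compat patches and their series file are dropped.
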
Mechanism ========= -Control three layers (RAM buffer, cache device and backing device) ------------------------------------------------------------------- -dm-writeboost controls three different layers - RAM buffer (rambuf), cache +Control three layers (RAM buffer, caching device and backing device) +-------------------------------------------------------------------- +dm-writeboost controls three different layers - RAM buffer (rambuf), caching device (cache_dev, e.g SSD) and backing device (backing_dev, e.g. HDD). All data are first stored in the RAM buffer and when the RAM buffer is full, dm-writeboost adds metadata block (with checksum) on the RAM buffer to create a -"log". Afterward, the log is written to the cache device as background -processing in sequential manner and thereafter it's written back to the backing -device in background as well. +"log". Afterward, the log is written to the caching device sequentially by a +background thread and thereafter written back to the backing device in the +background as well. dm-writeboost vs dm-cache or bcache @@ -34,20 +33,20 @@ it may not be the best when the ave. I/O size is very large in your workload. However, if the splitting overhead aside, dm-writeboost is always the best of all because it caches data in sequential manner - the most efficient I/O pattern -for the SSD cache device in terms of performance. +yet for the SSD caching device in terms of performance. It's known from experiments that dm-writeboost performs no good when you create a dm-writeboost'd device in virtual environment like KVM. So, keep in mind to -use this driver in the host (or physical) machine. +use this driver in a physical machine. How To Use dm-writeboost ======================== -Trigger cache device reformat ------------------------------ -The cache device is triggered reformating only if the first one sector of the -cache device is zeroed out. Note that this operation should be omitted when -you resume the cache device. +Trigger caching device reformat +------------------------------- +The caching device is triggered reformating only if the first one sector of the +caching device is zeroed out. Note that this operation should be omitted when +you resume the caching device. e.g. dd if=/dev/zero of=/dev/mapper/wbdev oflag=direct bs=512 count=1 Construct dm-writeboost'd device @@ -77,23 +76,26 @@ Shut down the system -------------------- On shutting down the system, you don't need to do anything at all. The data -and metadata is safely saved on the cache device. But, if you want to do +and metadata is safely saved on the caching device. But, if you want to do deconstruct the device manually, use dmsetup remove. Resume after system reboot -------------------------- To resume your caching device of the on-disk state, run dmsetup create command -with the same parameter but DO NOT zero out the first sector of the cache device. -This replays the logs on the cache device to rebuild the internal data structures. +with the same parameter but DO NOT zero out the first sector of the caching device. +This replays the logs on the caching device to rebuild the internal data structures. 
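The replay step described above is what makes resume safe: the segments on the caching device are scanned, only logs that validate (checksum and id continuity, as in find_max_id() and apply_valid_segments() in dm-writeboost-metadata.c further down) are applied, and the newest valid id becomes the starting point. The following is a toy userspace model of that idea only; the real on-disk segment layout differs.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define NR_SEGMENTS  4
    #define PAYLOAD_SIZE 16

    struct segment {
        uint64_t id;                    /* monotonically increasing log id */
        uint32_t checksum;              /* checksum over the payload */
        uint8_t  payload[PAYLOAD_SIZE]; /* cached blocks in the real module */
    };

    static uint32_t csum(const uint8_t *p, size_t n)
    {
        uint32_t s = 17;
        while (n--)
            s = s * 31 + *p++;
        return s;
    }

    int main(void)
    {
        struct segment segs[NR_SEGMENTS];
        uint64_t max_id = 0;
        int i;

        /* Build a fake log; segment index 2 will be "torn" (bad checksum). */
        for (i = 0; i < NR_SEGMENTS; i++) {
            memset(segs[i].payload, 'A' + i, PAYLOAD_SIZE);
            segs[i].id = i + 1;
            segs[i].checksum = csum(segs[i].payload, PAYLOAD_SIZE);
        }
        segs[2].payload[0] ^= 0xff;     /* simulate a partial write */

        /* Replay: apply segments in order while they still validate. */
        for (i = 0; i < NR_SEGMENTS; i++) {
            if (segs[i].checksum != csum(segs[i].payload, PAYLOAD_SIZE)) {
                printf("segment id=%llu invalid, stop replay\n",
                       (unsigned long long)segs[i].id);
                break;
            }
            max_id = segs[i].id;        /* the module rebuilds its hash table here */
        }
        printf("last_flushed_segment_id = %llu\n", (unsigned long long)max_id);
        return 0;
    }

In the module the same scan both sets last_flushed_segment_id and re-registers the surviving metablocks in the hash table, which is why no extra bookkeeping is needed across a reboot.
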
-Remove cache device -------------------- -If you want to detach your cache device for some reasons (you don't like -dm-writeboost anymore or you try to upgrade the cache device to a newly +Remove caching device +--------------------- +If you want to detach your caching device for some reasons (you don't like +dm-writeboost anymore or you try to upgrade the caching device to a newly perchased device) the safest way to do this is clean the dirty data up from your -cache device first and then deconstrust the dm-writeboost'd device. -You can use drop_caches message to forcibly clean up your cache device. +caching device first and then deconstrust the dm-writeboost'd device. +You can do this by first suspend/resuming the device to drop all transient data +from RAM buffer and then sending drop_caches message to drop dirty cache blocks +from the caching device. e.g. +dmsetup suspend wbdev; dmsetup resume wbdev dmsetup message wbdev 0 drop_caches dmsetup remove wbdev @@ -107,7 +109,7 @@ nr_max_batched_writeback accepts: 1..32 - default: 8 + default: 32 As optimization, dm-writeboost writes back $nr_max_batched_writeback segments simultaneously. The dirty caches in the segments are sorted in ascending order of the destination address and then written back. Setting large value can boost @@ -126,16 +128,15 @@ default: 0 (disabled) Sync all the volatile data every $sync_data_interval second. 0 means disabled. -read_cache_threshold (int) [Experimental] +read_cache_threshold (int) accepts: 0..127 default: 0 (read caching disabled) More than $read_cache_threshold * 4KB consecutive reads won't be staged. -write_through_mode (bool) +write_around_mode (bool) accepts: 0..1 default: 0 -By enabling this, dm-writeboost never cache dirty data by writing data directly -to the backing device. +By enabling this, dm-writeboost writes data directly to the backing device. Messages -------- @@ -153,9 +154,9 @@ (2) Others drop_caches - Wait for all dirty data on the cache device to be written back to the backing + Wait for all dirty data on the caching device to be written back to the backing device. This is interruptible. -clear_stats +clear_stat Clear the statistic info (see `Status`). Status diff -Nru dm-writeboost-2.1.1/Makefile dm-writeboost-2.2.6/Makefile --- dm-writeboost-2.1.1/Makefile 2016-01-04 13:25:39.000000000 +0000 +++ dm-writeboost-2.2.6/Makefile 2016-09-19 06:15:04.000000000 +0000 @@ -1,4 +1,4 @@ -MODULE_VERSION ?= 2.1.1 +MODULE_VERSION ?= 2.2.6 DKMS_DIR := /usr/src/dm-writeboost-$(MODULE_VERSION) DKMS_KEY := -m dm-writeboost -v $(MODULE_VERSION) diff -Nru dm-writeboost-2.1.1/README.md dm-writeboost-2.2.6/README.md --- dm-writeboost-2.1.1/README.md 2016-01-04 13:25:39.000000000 +0000 +++ dm-writeboost-2.2.6/README.md 2016-09-19 06:15:04.000000000 +0000 @@ -36,15 +36,15 @@ for system shutdown because dm-writeboost is even durable even against sudden power failure. 
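The nr_max_batched_writeback text above notes that the dirty caches of a whole batch of segments are sorted in ascending order of destination address before being written back, so the sweep over the backing device is close to sequential. Below is a userspace sketch of that ordering only; the module keeps the IOs in an RB-tree via compare_writeback_io() (see dm-writeboost-daemon.c below), and qsort() merely stands in for it here.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct writeback_io {
        uint64_t sector;  /* destination sector on the backing device */
        uint64_t id;      /* log id; larger means newer data */
    };

    static int cmp_writeback_io(const void *pa, const void *pb)
    {
        const struct writeback_io *a = pa, *b = pb;
        if (a->sector != b->sector)
            return a->sector < b->sector ? -1 : 1;
        /* same sector cached twice: issue the newer one last so it wins */
        return a->id < b->id ? -1 : (a->id > b->id);
    }

    int main(void)
    {
        struct writeback_io batch[] = {
            { 4096, 7 }, { 8, 5 }, { 520, 6 }, { 8, 9 }, { 16384, 5 },
        };
        size_t i, n = sizeof(batch) / sizeof(batch[0]);

        qsort(batch, n, sizeof(batch[0]), cmp_writeback_io);

        for (i = 0; i < n; i++)
            printf("write back sector %llu (from log id %llu)\n",
                   (unsigned long long)batch[i].sector,
                   (unsigned long long)batch[i].id);
        return 0;
    }

Using the id as the tie-break also means that when the same sector is dirty in two segments of the batch, the newer data is written last and therefore survives on the backing device.
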
## Distribution Packages -- Debian: [Stretch](https://packages.debian.org/testing/dm-writeboost-dkms), [Sid](https://packages.debian.org/sid/dm-writeboost-dkms) -- Ubuntu: [Wily](http://packages.ubuntu.com/wily/dm-writeboost-dkms) +- Debian: [Stretch](https://packages.debian.org/source/testing/dm-writeboost), [Sid](https://packages.debian.org/source/sid/dm-writeboost) +- Ubuntu: [Yakkety](http://packages.ubuntu.com/yakkety/kernel/dm-writeboost-dkms), [Xenial](http://packages.ubuntu.com/xenial/dm-writeboost-dkms), [Wily](http://packages.ubuntu.com/wily/dm-writeboost-dkms) - [Tanglu](http://packages.tanglu.org/ja/dasyatis/kernel/dm-writeboost-dkms) - Momonga ## Related Projects * https://github.com/akiradeveloper/dm-writeboost-tools: Tools to help users analyze the state of the cache device * https://gitlab.com/onlyjob/writeboost: A management tool including init script -* https://github.com/jthornber/device-mapper-test-suite: Testing framework written in Ruby +* https://github.com/akiradeveloper/writeboost-test-suite: Testing framework written in Scala ## Related works * Y. Hu and Q. Yang -- DCD Disk Caching Disk: A New Approach for Boosting I/O Performance (1995) diff -Nru dm-writeboost-2.1.1/src/dkms.conf dm-writeboost-2.2.6/src/dkms.conf --- dm-writeboost-2.1.1/src/dkms.conf 2016-01-04 13:25:39.000000000 +0000 +++ dm-writeboost-2.2.6/src/dkms.conf 2016-09-19 06:15:04.000000000 +0000 @@ -1,5 +1,5 @@ PACKAGE_NAME="dm-writeboost" -PACKAGE_VERSION="2.1.1" +PACKAGE_VERSION="2.2.6" BUILT_MODULE_NAME="dm-writeboost" DEST_MODULE_LOCATION="/kernel/drivers/md" MAKE="make all KERNEL_TREE=$kernel_source_dir" diff -Nru dm-writeboost-2.1.1/src/dm-writeboost-daemon.c dm-writeboost-2.2.6/src/dm-writeboost-daemon.c --- dm-writeboost-2.1.1/src/dm-writeboost-daemon.c 2016-01-04 13:25:39.000000000 +0000 +++ dm-writeboost-2.2.6/src/dm-writeboost-daemon.c 2016-09-19 06:15:04.000000000 +0000 @@ -52,72 +52,92 @@ /*----------------------------------------------------------------------------*/ -static void process_deferred_barriers(struct wb_device *wb, struct flush_job *job) +static void process_deferred_barriers(struct wb_device *wb, struct rambuffer *rambuf) { - int r = 0; - bool has_barrier = !bio_list_empty(&job->barrier_ios); - - /* Make all the preceding data persistent. */ - if (has_barrier) - maybe_IO(blkdev_issue_flush(wb->cache_dev->bdev, GFP_NOIO, NULL)); - - /* Ack the chained barrier requests. */ + bool has_barrier = !bio_list_empty(&rambuf->barrier_ios); if (has_barrier) { struct bio *bio; - while ((bio = bio_list_pop(&job->barrier_ios))) { - if (is_live(wb)) - bio_endio_compat(bio, 0); - else - bio_endio_compat(bio, -EIO); - } + + /* Make all the preceding data persistent. */ + int err = blkdev_issue_flush(wb->cache_dev->bdev, GFP_NOIO, NULL); + + /* Ack the chained barrier requests. 
*/ + while ((bio = bio_list_pop(&rambuf->barrier_ios))) + bio_endio_compat(bio, err); } } -void flush_proc(struct work_struct *work) +static bool should_flush(struct wb_device *wb) { - int r = 0; + return atomic64_read(&wb->last_queued_segment_id) > + atomic64_read(&wb->last_flushed_segment_id); +} - struct flush_job *job = container_of(work, struct flush_job, work); - struct rambuffer *rambuf = container_of(job, struct rambuffer, job); +static void do_flush_proc(struct wb_device *wb) +{ + struct segment_header *seg; + struct rambuffer *rambuf; + u64 id; + struct dm_io_request io_req; + struct dm_io_region region; - struct wb_device *wb = job->wb; - struct segment_header *seg = job->seg; + if (!should_flush(wb)) { + schedule_timeout_interruptible(msecs_to_jiffies(1000)); + return; + } + + id = atomic64_read(&wb->last_flushed_segment_id) + 1; - struct dm_io_request io_req = { + smp_rmb(); + + rambuf = get_rambuffer_by_id(wb, id); + seg = rambuf->seg; + + io_req = (struct dm_io_request) { + WB_IO_WRITE, .client = wb->io_client, - .bi_rw = WRITE, .notify.fn = NULL, .mem.type = DM_IO_VMA, .mem.ptr.addr = rambuf->data, }; - struct dm_io_region region = { + region = (struct dm_io_region) { .bdev = wb->cache_dev->bdev, .sector = seg->start_sector, .count = (seg->length + 1) << 3, }; - maybe_IO(wb_io(&io_req, 1, ®ion, NULL, false)); + if (wb_io(&io_req, 1, ®ion, NULL, false)) + return; /* * Deferred ACK for barrier requests * To serialize barrier ACK in logging we wait for the previous segment * to be persistently written (if needed). */ - wait_for_flushing(wb, SUB_ID(seg->id, 1)); - process_deferred_barriers(wb, job); + process_deferred_barriers(wb, rambuf); /* * We can count up the last_flushed_segment_id only after segment * is written persistently. Counting up the id is serialized. 
*/ + smp_wmb(); atomic64_inc(&wb->last_flushed_segment_id); wake_up(&wb->flush_wait_queue); } +int flush_daemon_proc(void *data) +{ + struct wb_device *wb = data; + while (!kthread_should_stop()) + do_flush_proc(wb); + return 0; +} + void wait_for_flushing(struct wb_device *wb, u64 id) { wait_event(wb->flush_wait_queue, atomic64_read(&wb->last_flushed_segment_id) >= id); + smp_rmb(); } /*----------------------------------------------------------------------------*/ @@ -135,14 +155,12 @@ static void submit_writeback_io(struct wb_device *wb, struct writeback_io *writeback_io) { - int r; - - BUG_ON(!writeback_io->data_bits); + ASSERT(writeback_io->data_bits > 0); if (writeback_io->data_bits == 255) { struct dm_io_request io_req_w = { + WB_IO_WRITE, .client = wb->io_client, - .bi_rw = WRITE, .notify.fn = writeback_endio, .notify.context = wb, .mem.type = DM_IO_VMA, @@ -153,9 +171,8 @@ .sector = writeback_io->sector, .count = 1 << 3, }; - maybe_IO(wb_io(&io_req_w, 1, ®ion_w, NULL, false)); - if (r) - writeback_endio(0, wb); + if (wb_io(&io_req_w, 1, ®ion_w, NULL, false)) + writeback_endio(1, wb); } else { u8 i; for (i = 0; i < 8; i++) { @@ -167,21 +184,20 @@ continue; io_req_w = (struct dm_io_request) { + WB_IO_WRITE, .client = wb->io_client, - .bi_rw = WRITE, .notify.fn = writeback_endio, .notify.context = wb, .mem.type = DM_IO_VMA, - .mem.ptr.addr = writeback_io->data + (i << SECTOR_SHIFT), + .mem.ptr.addr = writeback_io->data + (i << 9), }; region_w = (struct dm_io_region) { .bdev = wb->backing_dev->bdev, .sector = writeback_io->sector + i, .count = 1, }; - maybe_IO(wb_io(&io_req_w, 1, ®ion_w, NULL, false)); - if (r) - writeback_endio(0, wb); + if (wb_io(&io_req_w, 1, ®ion_w, NULL, false)) + writeback_endio(1, wb); } } } @@ -209,8 +225,8 @@ */ static bool compare_writeback_io(struct writeback_io *a, struct writeback_io *b) { - BUG_ON(!a); - BUG_ON(!b); + ASSERT(a); + ASSERT(b); if (a->sector < b->sector) return true; if (a->id < b->id) @@ -254,20 +270,13 @@ rb_insert_color(&writeback_io->rb_node, &wb->writeback_tree); } -/* - * Read the data to writeback IOs and add them into the RB-tree to sort. 
- */ -static void prepare_writeback_ios(struct wb_device *wb, struct writeback_segment *writeback_seg, - size_t *writeback_io_count) +static int fill_writeback_seg(struct wb_device *wb, struct writeback_segment *writeback_seg) { - int r = 0; - u8 i; - struct segment_header *seg = writeback_seg->seg; struct dm_io_request io_req_r = { + WB_IO_READ, .client = wb->io_client, - .bi_rw = READ, .notify.fn = NULL, .mem.type = DM_IO_VMA, .mem.ptr.addr = writeback_seg->buf, @@ -282,14 +291,21 @@ * dm_io() allows region.count = 0 * so we don't need to skip here in case of seg->length = 0 */ - maybe_IO(wb_io(&io_req_r, 1, ®ion_r, NULL, false)); + return wb_io(&io_req_r, 1, ®ion_r, NULL, false); +} +static void prepare_writeback_ios(struct wb_device *wb, struct writeback_segment *writeback_seg, + size_t *writeback_io_count) +{ + struct segment_header *seg = writeback_seg->seg; + + u8 i; for (i = 0; i < seg->length; i++) { struct writeback_io *writeback_io; struct metablock *mb = seg->mb_array + i; struct dirtiness dirtiness = read_mb_dirtiness(wb, seg, mb); - BUG_ON(!dirtiness.data_bits); + ASSERT(dirtiness.data_bits > 0); if (!dirtiness.is_dirty) continue; @@ -304,7 +320,7 @@ } } -static void mark_clean_seg(struct wb_device *wb, struct segment_header *seg) +void mark_clean_seg(struct wb_device *wb, struct segment_header *seg) { u8 i; for (i = 0; i < seg->length; i++) { @@ -314,54 +330,66 @@ } } -static void do_writeback_segs(struct wb_device *wb) +/* + * Try writeback some specified segs and returns if all writeback ios succeeded. + */ +static bool try_writeback_segs(struct wb_device *wb) { - int r; - size_t k; struct writeback_segment *writeback_seg; - size_t writeback_io_count = 0; + u32 k; /* Create RB-tree */ wb->writeback_tree = RB_ROOT; - for (k = 0; k < wb->num_writeback_segs; k++) { + for (k = 0; k < wb->nr_cur_batched_writeback; k++) { writeback_seg = *(wb->writeback_segs + k); + + if (fill_writeback_seg(wb, writeback_seg)) + return false; + prepare_writeback_ios(wb, writeback_seg, &writeback_io_count); } + atomic_set(&wb->writeback_io_count, writeback_io_count); atomic_set(&wb->writeback_fail_count, 0); /* Pop rbnodes out of the tree and submit writeback I/Os */ submit_writeback_ios(wb); wait_event(wb->writeback_io_wait_queue, !atomic_read(&wb->writeback_io_count)); - if (atomic_read(&wb->writeback_fail_count)) - mark_dead(wb); - maybe_IO(blkdev_issue_flush(wb->backing_dev->bdev, GFP_NOIO, NULL)); - /* A segment after written back is clean */ - for (k = 0; k < wb->num_writeback_segs; k++) { - writeback_seg = *(wb->writeback_segs + k); - mark_clean_seg(wb, writeback_seg->seg); - } - atomic64_add(wb->num_writeback_segs, &wb->last_writeback_segment_id); + return atomic_read(&wb->writeback_fail_count) == 0; +} + +static bool do_writeback_segs(struct wb_device *wb) +{ + if (!try_writeback_segs(wb)) + return false; + + blkdev_issue_flush(wb->backing_dev->bdev, GFP_NOIO, NULL); + return true; } /* * Calculate the number of segments to write back. 
*/ -static u32 calc_nr_writeback(struct wb_device *wb) +void update_nr_empty_segs(struct wb_device *wb) { - u32 nr_writeback_candidates, nr_max_batch; + wb->nr_empty_segs = + atomic64_read(&wb->last_writeback_segment_id) + wb->nr_segments + - wb->current_seg->id; +} - nr_writeback_candidates = atomic64_read(&wb->last_flushed_segment_id) - - atomic64_read(&wb->last_writeback_segment_id); - if (!nr_writeback_candidates) - return 0; +static u32 calc_nr_writeback(struct wb_device *wb) +{ + u32 nr_writeback_candidates = + atomic64_read(&wb->last_flushed_segment_id) + - atomic64_read(&wb->last_writeback_segment_id); - nr_max_batch = ACCESS_ONCE(wb->nr_max_batched_writeback); - if (wb->nr_cur_batched_writeback != nr_max_batch) + u32 nr_max_batch = ACCESS_ONCE(wb->nr_max_batched_writeback); + if (wb->nr_writeback_segs != nr_max_batch) try_alloc_writeback_ios(wb, nr_max_batch, GFP_NOIO | __GFP_NOWARN); - return min(nr_writeback_candidates, wb->nr_cur_batched_writeback); + + return min3(nr_writeback_candidates, wb->nr_writeback_segs, wb->nr_empty_segs + 1); } static bool should_writeback(struct wb_device *wb) @@ -373,29 +401,40 @@ static void do_writeback_proc(struct wb_device *wb) { - u32 k, nr_writeback; + u32 k, nr_writeback_tbd; if (!should_writeback(wb)) { schedule_timeout_interruptible(msecs_to_jiffies(1000)); return; } - nr_writeback = calc_nr_writeback(wb); - if (!nr_writeback) { + nr_writeback_tbd = calc_nr_writeback(wb); + if (!nr_writeback_tbd) { schedule_timeout_interruptible(msecs_to_jiffies(1000)); return; } + smp_rmb(); + /* Store segments into writeback_segs */ - for (k = 0; k < nr_writeback; k++) { + for (k = 0; k < nr_writeback_tbd; k++) { struct writeback_segment *writeback_seg = *(wb->writeback_segs + k); writeback_seg->seg = get_segment_header_by_id(wb, atomic64_read(&wb->last_writeback_segment_id) + 1 + k); } - wb->num_writeback_segs = nr_writeback; + wb->nr_cur_batched_writeback = nr_writeback_tbd; + + if (!do_writeback_segs(wb)) + return; - do_writeback_segs(wb); + /* A segment after written back is clean */ + for (k = 0; k < wb->nr_cur_batched_writeback; k++) { + struct writeback_segment *writeback_seg = *(wb->writeback_segs + k); + mark_clean_seg(wb, writeback_seg->seg); + } + smp_wmb(); + atomic64_add(wb->nr_cur_batched_writeback, &wb->last_writeback_segment_id); wake_up(&wb->writeback_wait_queue); } @@ -417,6 +456,7 @@ wake_up_process(wb->writeback_daemon); wait_event(wb->writeback_wait_queue, atomic64_read(&wb->last_writeback_segment_id) >= id); + smp_rmb(); wb->urge_writeback = false; } @@ -442,6 +482,8 @@ old = new; + update_nr_empty_segs(wb); + schedule_timeout_interruptible(msecs_to_jiffies(intvl)); } return 0; @@ -451,8 +493,6 @@ static void update_superblock_record(struct wb_device *wb) { - int r = 0; - struct superblock_record_device o; void *buf; struct dm_io_request io_req; @@ -466,8 +506,8 @@ memcpy(buf, &o, sizeof(o)); io_req = (struct dm_io_request) { + WB_IO_WRITE_FUA, .client = wb->io_client, - .bi_rw = WRITE_FUA, .notify.fn = NULL, .mem.type = DM_IO_KMEM, .mem.ptr.addr = buf, @@ -477,7 +517,7 @@ .sector = (1 << 11) - 1, .count = 1, }; - maybe_IO(wb_io(&io_req, 1, ®ion, NULL, false)); + wb_io(&io_req, 1, ®ion, NULL, false); mempool_free(buf, wb->buf_1_pool); } @@ -507,8 +547,6 @@ int data_synchronizer_proc(void *data) { - int r = 0; - struct wb_device *wb = data; unsigned long intvl; @@ -522,7 +560,7 @@ } flush_current_buffer(wb); - maybe_IO(blkdev_issue_flush(wb->cache_dev->bdev, GFP_NOIO, NULL)); + blkdev_issue_flush(wb->cache_dev->bdev, GFP_NOIO, NULL); 
schedule_timeout_interruptible(msecs_to_jiffies(intvl)); } return 0; diff -Nru dm-writeboost-2.1.1/src/dm-writeboost-daemon.h dm-writeboost-2.2.6/src/dm-writeboost-daemon.h --- dm-writeboost-2.1.1/src/dm-writeboost-daemon.h 2016-01-04 13:25:39.000000000 +0000 +++ dm-writeboost-2.2.6/src/dm-writeboost-daemon.h 2016-09-19 06:15:04.000000000 +0000 @@ -22,7 +22,7 @@ /*----------------------------------------------------------------------------*/ -void flush_proc(struct work_struct *); +int flush_daemon_proc(void *); void wait_for_flushing(struct wb_device *, u64 id); /*----------------------------------------------------------------------------*/ @@ -32,8 +32,10 @@ /*----------------------------------------------------------------------------*/ +void update_nr_empty_segs(struct wb_device *); int writeback_daemon_proc(void *); void wait_for_writeback(struct wb_device *, u64 id); +void mark_clean_seg(struct wb_device *, struct segment_header *seg); /*----------------------------------------------------------------------------*/ diff -Nru dm-writeboost-2.1.1/src/dm-writeboost.h dm-writeboost-2.2.6/src/dm-writeboost.h --- dm-writeboost-2.1.1/src/dm-writeboost.h 2016-01-04 13:25:39.000000000 +0000 +++ dm-writeboost-2.2.6/src/dm-writeboost.h 2016-09-19 06:15:04.000000000 +0000 @@ -104,7 +104,6 @@ */ /* - FROM ------------------------------------ */ __le64 id; - /* TODO Add timestamp? */ __le32 checksum; /* * The number of metablocks in this segment header to be considered in @@ -150,22 +149,12 @@ /*----------------------------------------------------------------------------*/ /* - * Foreground queues this object and flush daemon later pops one job to submit - * logging write to the cache device. - */ -struct flush_job { - struct work_struct work; - struct wb_device *wb; - struct segment_header *seg; - struct bio_list barrier_ios; /* List of deferred bios */ -}; - -/* * RAM buffer is a buffer that any dirty data are first written into. */ struct rambuffer { + struct segment_header *seg; void *data; - struct flush_job job; + struct bio_list barrier_ios; /* List of deferred bios */ }; /*----------------------------------------------------------------------------*/ @@ -245,13 +234,7 @@ #define STATLEN (1 << 4) enum WB_FLAG { - /* - * This flag is set when either one of the underlying devices returned - * EIO and we must immediately block up the whole to avoid further - * damage. - */ - WB_DEAD = 0, - WB_CREATED = 1, + WB_CREATED = 0, }; #define SEGMENT_SIZE_ORDER 10 @@ -266,7 +249,7 @@ struct dm_dev *backing_dev; /* Slow device (HDD) */ struct dm_dev *cache_dev; /* Fast device (SSD) */ - bool write_through_mode; + bool write_around_mode; unsigned nr_ctr_args; const char **ctr_args; @@ -336,23 +319,23 @@ struct rambuffer *rambuf_pool; + atomic64_t last_queued_segment_id; + /*--------------------------------------------------------------------*/ /******************** * One-shot Writeback ********************/ - wait_queue_head_t writeback_mb_wait_queue; struct dm_kcopyd_client *copier; /*--------------------------------------------------------------------*/ - /**************** - * Buffer Flusher - ****************/ + /************** + * Flush Daemon + **************/ - mempool_t *flush_job_pool; - struct workqueue_struct *flusher_wq; + struct task_struct *flush_daemon; /* * Wait for a specified segment to be flushed. 
Non-interruptible @@ -407,13 +390,15 @@ atomic_t writeback_io_count; atomic_t writeback_fail_count; - u32 nr_cur_batched_writeback; u32 nr_max_batched_writeback; /* Tunable */ + u32 nr_max_batched_writeback_saved; struct rb_root writeback_tree; - u32 num_writeback_segs; /* Number of segments to write back */ + u32 nr_writeback_segs; struct writeback_segment **writeback_segs; + u32 nr_cur_batched_writeback; /* Number of segments to be written back */ + u32 nr_empty_segs; /*--------------------------------------------------------------------*/ @@ -423,6 +408,7 @@ struct task_struct *writeback_modulator; u8 writeback_threshold; /* Tunable */ + u8 writeback_threshold_saved; /*--------------------------------------------------------------------*/ @@ -432,6 +418,7 @@ struct task_struct *sb_record_updater; unsigned long update_sb_record_interval; /* Tunable */ + unsigned long update_sb_record_interval_saved; /*--------------------------------------------------------------------*/ @@ -441,6 +428,7 @@ struct task_struct *data_synchronizer; unsigned long sync_data_interval; /* Tunable */ + unsigned long sync_data_interval_saved; /*--------------------------------------------------------------------*/ @@ -448,9 +436,12 @@ * Read Caching **************/ + u32 nr_read_cache_cells; + u32 nr_read_cache_cells_saved; struct work_struct read_cache_work; struct read_cache_cells *read_cache_cells; u32 read_cache_threshold; /* Tunable */ + u32 read_cache_threshold_saved; /*--------------------------------------------------------------------*/ @@ -468,6 +459,11 @@ /*----------------------------------------------------------------------------*/ +struct write_io { + void *data; /* 4KB */ + u8 data_bits; +}; + void acquire_new_seg(struct wb_device *, u64 id); void cursor_init(struct wb_device *); void flush_current_buffer(struct wb_device *); @@ -475,10 +471,12 @@ void dec_nr_dirty_caches(struct wb_device *); bool mark_clean_mb(struct wb_device *, struct metablock *); struct dirtiness read_mb_dirtiness(struct wb_device *, struct segment_header *, struct metablock *); -void prepare_overwrite(struct wb_device *, struct segment_header *, struct metablock *old_mb, bool overwrite_fullsize); +int prepare_overwrite(struct wb_device *, struct segment_header *, struct metablock *old_mb, struct write_io *, u8 overwrite_bits); /*----------------------------------------------------------------------------*/ +#define ASSERT(cond) BUG_ON(!(cond)) + #define check_buffer_alignment(buf) \ do_check_buffer_alignment(buf, #buf, __func__) void do_check_buffer_alignment(void *, const char *, const char *); @@ -500,45 +498,21 @@ /*----------------------------------------------------------------------------*/ -/* - * Device blockup (Marking the device as dead) - * ------------------------------------------- - * - * I/O error on cache device blocks up the whole system. - * After the system is blocked up, cache device is dead, all I/Os to cache - * device are ignored as if it becomes /dev/null. - */ -#define mark_dead(wb) set_bit(WB_DEAD, &wb->flags) -#define is_live(wb) likely(!test_bit(WB_DEAD, &wb->flags)) - -/* - * This macro wraps I/Os to cache device to add context of failure. 
- */ -#define maybe_IO(proc) \ - do { \ - r = 0; \ - if (is_live(wb)) {\ - r = proc; \ - } else { \ - r = -EIO; \ - break; \ - } \ - \ - if (r == -EIO) { \ - mark_dead(wb); \ - DMERR("device is marked as dead"); \ - break; \ - } else if (r == -ENOMEM) { \ - DMERR("I/O failed by ENOMEM"); \ - schedule_timeout_interruptible(msecs_to_jiffies(1000));\ - continue; \ - } else if (r == -EOPNOTSUPP) { \ - break; \ - } else if (r) { \ - WARN_ONCE(1, "I/O failed for unknown reason err(%d)", r); \ - break; \ - } \ - } while (r) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,8,0) +#define req_is_write(req) op_is_write((req)->bi_op) +#define bio_is_barrier(bio) ((bio)->bi_opf & REQ_PREFLUSH) +#define bio_is_fua(bio) ((bio)->bi_opf & REQ_FUA) +#define WB_IO_WRITE .bi_op = REQ_OP_WRITE, .bi_op_flags = 0 +#define WB_IO_READ .bi_op = REQ_OP_READ, .bi_op_flags = 0 +#define WB_IO_WRITE_FUA .bi_op = REQ_OP_WRITE, .bi_op_flags = REQ_FUA +#else +#define req_is_write(req) ((req)->bi_rw == WRITE) +#define bio_is_barrier(bio) ((bio)->bi_rw & REQ_FLUSH) +#define bio_is_fua(bio) ((bio)->bi_rw & REQ_FUA) +#define WB_IO_WRITE .bi_rw = WRITE +#define WB_IO_READ .bi_rw = READ +#define WB_IO_WRITE_FUA .bi_rw = WRITE_FUA +#endif /*----------------------------------------------------------------------------*/ diff -Nru dm-writeboost-2.1.1/src/dm-writeboost-metadata.c dm-writeboost-2.2.6/src/dm-writeboost-metadata.c --- dm-writeboost-2.1.1/src/dm-writeboost-metadata.c 2016-01-04 13:25:39.000000000 +0000 +++ dm-writeboost-2.2.6/src/dm-writeboost-metadata.c 2016-09-19 06:15:04.000000000 +0000 @@ -249,7 +249,7 @@ struct ht_head *ht_get_head(struct wb_device *wb, struct lookup_key *key) { u32 idx; - div_u64_rem(key->sector, wb->htsize, &idx); + div_u64_rem(key->sector >> 3, wb->htsize, &idx); return large_array_at(wb->htable, idx); } @@ -277,6 +277,7 @@ hlist_del(&mb->ht_list); hlist_add_head(&mb->ht_list, &head->ht_list); + BUG_ON(key->sector & 7); // should be 4KB aligned mb->sector = key->sector; }; @@ -310,7 +311,7 @@ static int read_superblock_header(struct superblock_header_device *sup, struct wb_device *wb) { - int r = 0; + int err = 0; struct dm_io_request io_req_sup; struct dm_io_region region_sup; @@ -320,8 +321,8 @@ check_buffer_alignment(buf); io_req_sup = (struct dm_io_request) { + WB_IO_READ, .client = wb->io_client, - .bi_rw = READ, .notify.fn = NULL, .mem.type = DM_IO_KMEM, .mem.ptr.addr = buf, @@ -331,15 +332,15 @@ .sector = 0, .count = 1, }; - r = wb_io(&io_req_sup, 1, ®ion_sup, NULL, false); - if (r) + err = wb_io(&io_req_sup, 1, ®ion_sup, NULL, false); + if (err) goto bad_io; memcpy(sup, buf, sizeof(*sup)); bad_io: mempool_free(buf, wb->buf_1_pool); - return r; + return err; } /* @@ -348,27 +349,28 @@ */ static int audit_cache_device(struct wb_device *wb) { - int r = 0; + int err = 0; struct superblock_header_device sup; - r = read_superblock_header(&sup, wb); - if (r) { + err = read_superblock_header(&sup, wb); + if (err) { DMERR("read_superblock_header failed"); - return r; + return err; } wb->do_format = false; - if (le32_to_cpu(sup.magic) != WB_MAGIC) { + if (le32_to_cpu(sup.magic) != WB_MAGIC || + wb->write_around_mode) { /* write-around mode should discard all caches */ wb->do_format = true; DMERR("Superblock Header: Magic number invalid"); return 0; } - return r; + return err; } static int format_superblock_header(struct wb_device *wb) { - int r = 0; + int err = 0; struct dm_io_request io_req_sup; struct dm_io_region region_sup; @@ -384,8 +386,8 @@ memcpy(buf, &sup, sizeof(sup)); io_req_sup = (struct 
dm_io_request) { + WB_IO_WRITE_FUA, .client = wb->io_client, - .bi_rw = WRITE_FUA, .notify.fn = NULL, .mem.type = DM_IO_KMEM, .mem.ptr.addr = buf, @@ -395,13 +397,13 @@ .sector = 0, .count = 1, }; - r = wb_io(&io_req_sup, 1, ®ion_sup, NULL, false); - if (r) + err = wb_io(&io_req_sup, 1, ®ion_sup, NULL, false); + if (err) goto bad_io; bad_io: mempool_free(buf, wb->buf_1_pool); - return r; + return err; } struct format_segmd_context { @@ -435,13 +437,13 @@ */ static int do_zeroing_region(struct wb_device *wb, struct dm_io_region *region) { - int r; + int err; struct zeroing_context zc; zc.error = 0; init_completion(&zc.complete); - r = dm_kcopyd_zero(wb->copier, 1, region, 0, zeroing_complete, &zc); - if (r) - return r; + err = dm_kcopyd_zero(wb->copier, 1, region, 0, zeroing_complete, &zc); + if (err) + return err; wait_for_completion(&zc.complete); return zc.error; } @@ -458,7 +460,7 @@ static int format_all_segment_headers(struct wb_device *wb) { - int r = 0; + int err = 0; struct dm_dev *dev = wb->cache_dev; u32 i; @@ -473,13 +475,11 @@ atomic64_set(&context.count, wb->nr_segments); context.err = 0; - /* - * Submit all the writes asynchronously. - */ + /* Submit all the writes asynchronously. */ for (i = 0; i < wb->nr_segments; i++) { struct dm_io_request io_req_seg = { + WB_IO_WRITE, .client = wb->io_client, - .bi_rw = WRITE, .notify.fn = format_segmd_endio, .notify.context = &context, .mem.type = DM_IO_KMEM, @@ -490,12 +490,12 @@ .sector = calc_segment_header_start(wb, i), .count = (1 << 3), }; - r = wb_io(&io_req_seg, 1, ®ion_seg, NULL, false); - if (r) + err = wb_io(&io_req_seg, 1, ®ion_seg, NULL, false); + if (err) break; } - if (r) + if (err) goto bad; /* Wait for all the writes complete. */ @@ -504,12 +504,15 @@ if (context.err) { DMERR("I/O failed"); - r = -EIO; + err = -EIO; + goto bad; } + err = blkdev_issue_flush(dev->bdev, GFP_KERNEL, NULL); + bad: mempool_free(buf, wb->buf_8_pool); - return r; + return err; } /* @@ -517,27 +520,22 @@ */ static int format_cache_device(struct wb_device *wb) { - int r = 0; - struct dm_dev *dev = wb->cache_dev; - - r = zeroing_full_superblock(wb); - if (r) { + int err = zeroing_full_superblock(wb); + if (err) { DMERR("zeroing_full_superblock failed"); - return r; + return err; } - r = format_superblock_header(wb); /* First 512B */ - if (r) { - DMERR("format_superblock_header failed"); - return r; - } - r = format_all_segment_headers(wb); - if (r) { + err = format_all_segment_headers(wb); + if (err) { DMERR("format_all_segment_headers failed"); - return r; + return err; } - r = blkdev_issue_flush(dev->bdev, GFP_KERNEL, NULL); - - return r; + err = format_superblock_header(wb); /* First 512B */ + if (err) { + DMERR("format_superblock_header failed"); + return err; + } + return err; } /* @@ -548,30 +546,30 @@ */ static int might_format_cache_device(struct wb_device *wb) { - int r = 0; + int err = 0; - r = audit_cache_device(wb); - if (r) { + err = audit_cache_device(wb); + if (err) { DMERR("audit_cache_device failed"); - return r; + return err; } if (wb->do_format) { - r = format_cache_device(wb); - if (r) { + err = format_cache_device(wb); + if (err) { DMERR("format_cache_device failed"); - return r; + return err; } } - return r; + return err; } /*----------------------------------------------------------------------------*/ static int init_rambuf_pool(struct wb_device *wb) { - int r = 0; + int err = 0; size_t i; wb->rambuf_pool = kmalloc(sizeof(struct rambuffer) * NR_RAMBUF_POOL, GFP_KERNEL); @@ -579,24 +577,24 @@ return -ENOMEM; for (i = 0; i < 
NR_RAMBUF_POOL; i++) { - void *alloced = vmalloc(1 << (SEGMENT_SIZE_ORDER + SECTOR_SHIFT)); + void *alloced = vmalloc(1 << (SEGMENT_SIZE_ORDER + 9)); if (!alloced) { size_t j; DMERR("Failed to allocate rambuf->data"); for (j = 0; j < i; j++) { vfree(wb->rambuf_pool[j].data); } - r = -ENOMEM; + err = -ENOMEM; goto bad_alloc_data; } wb->rambuf_pool[i].data = alloced; } - return r; + return err; bad_alloc_data: kfree(wb->rambuf_pool); - return r; + return err; } static void free_rambuf_pool(struct wb_device *wb) @@ -607,6 +605,13 @@ kfree(wb->rambuf_pool); } +struct rambuffer *get_rambuffer_by_id(struct wb_device *wb, u64 id) +{ + u32 tmp32; + div_u64_rem(id - 1, NR_RAMBUF_POOL, &tmp32); + return wb->rambuf_pool + tmp32; +} + /*----------------------------------------------------------------------------*/ /* @@ -616,19 +621,19 @@ */ static int init_devices(struct wb_device *wb) { - int r = 0; + int err = 0; - r = might_format_cache_device(wb); - if (r) - return r; + err = might_format_cache_device(wb); + if (err) + return err; - r = init_rambuf_pool(wb); - if (r) { + err = init_rambuf_pool(wb); + if (err) { DMERR("init_rambuf_pool failed"); - return r; + return err; } - return r; + return err; } static void free_devices(struct wb_device *wb) @@ -641,7 +646,7 @@ static int read_superblock_record(struct superblock_record_device *record, struct wb_device *wb) { - int r = 0; + int err = 0; struct dm_io_request io_req; struct dm_io_region region; @@ -652,8 +657,8 @@ check_buffer_alignment(buf); io_req = (struct dm_io_request) { + WB_IO_READ, .client = wb->io_client, - .bi_rw = READ, .notify.fn = NULL, .mem.type = DM_IO_KMEM, .mem.ptr.addr = buf, @@ -663,15 +668,15 @@ .sector = (1 << 11) - 1, .count = 1, }; - r = wb_io(&io_req, 1, ®ion, NULL, false); - if (r) + err = wb_io(&io_req, 1, ®ion, NULL, false); + if (err) goto bad_io; memcpy(record, buf, sizeof(*record)); bad_io: mempool_free(buf, wb->buf_1_pool); - return r; + return err; } /* @@ -681,8 +686,8 @@ struct segment_header *seg) { struct dm_io_request io_req = { + WB_IO_READ, .client = wb->io_client, - .bi_rw = READ, .notify.fn = NULL, .mem.type = DM_IO_VMA, .mem.ptr.addr = buf, @@ -712,7 +717,7 @@ struct segment_header_device *dest = rambuffer; u32 i; - BUG_ON((src->length) != (wb->cursor - src->start_idx)); + ASSERT((src->length) == (wb->cursor - src->start_idx)); for (i = 0; i < src->length; i++) { struct metablock *mb = src->mb_array + i; @@ -732,8 +737,8 @@ /* * Apply @i-th metablock in @src to @seg */ -static void apply_metablock_device(struct wb_device *wb, struct segment_header *seg, - struct segment_header_device *src, u8 i) +static int apply_metablock_device(struct wb_device *wb, struct segment_header *seg, + struct segment_header_device *src, u8 i) { struct lookup_key key; struct ht_head *head; @@ -751,23 +756,70 @@ head = ht_get_head(wb, &key); found = ht_lookup(wb, head, &key); if (found) { - bool overwrite_fullsize = (mb->dirtiness.data_bits == 255); - prepare_overwrite(wb, mb_to_seg(wb, found), found, overwrite_fullsize); + int err = 0; + u8 i; + struct write_io wio; + void *buf = mempool_alloc(wb->buf_8_pool, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + wio = (struct write_io) { + .data = buf, + .data_bits = 0, + }; + err = prepare_overwrite(wb, mb_to_seg(wb, found), found, &wio, mb->dirtiness.data_bits); + if (err) + goto fail_out; + + for (i = 0; i < 8; i++) { + struct dm_io_request io_req; + struct dm_io_region region; + if (!(wio.data_bits & (1 << i))) + continue; + + io_req = (struct dm_io_request) { + WB_IO_WRITE, + 
.client = wb->io_client, + .notify.fn = NULL, + .mem.type = DM_IO_KMEM, + .mem.ptr.addr = wio.data + (i << 9), + }; + region = (struct dm_io_region) { + .bdev = wb->backing_dev->bdev, + .sector = mb->sector + i, + .count = 1, + }; + err = wb_io(&io_req, 1, ®ion, NULL, true); + if (err) + break; + } + +fail_out: + mempool_free(buf, wb->buf_8_pool); + if (err) + return err; } ht_register(wb, head, mb, &key); if (mb->dirtiness.is_dirty) inc_nr_dirty_caches(wb); + + return 0; } -static void apply_segment_header_device(struct wb_device *wb, struct segment_header *seg, - struct segment_header_device *src) +static int apply_segment_header_device(struct wb_device *wb, struct segment_header *seg, + struct segment_header_device *src) { + int err = 0; u8 i; seg->length = src->length; - for (i = 0; i < src->length; i++) - apply_metablock_device(wb, seg, src, i); + for (i = 0; i < src->length; i++) { + err = apply_metablock_device(wb, seg, src, i); + if (err) + break; + } + return err; } /* @@ -777,8 +829,8 @@ struct segment_header *seg) { struct dm_io_request io_req = { + WB_IO_READ, .client = wb->io_client, - .bi_rw = READ, .notify.fn = NULL, .mem.type = DM_IO_KMEM, .mem.ptr.addr = buf, @@ -797,7 +849,7 @@ */ static int do_find_max_id(struct wb_device *wb, u64 *max_id) { - int r = 0; + int err = 0; u32 k; void *buf = mempool_alloc(wb->buf_8_pool, GFP_KERNEL); @@ -809,18 +861,17 @@ for (k = 0; k < wb->nr_segments; k++) { struct segment_header *seg = segment_at(wb, k); struct segment_header_device *header; - r = read_segment_header(buf, wb, seg); - if (r) { - kfree(buf); - return r; - } + err = read_segment_header(buf, wb, seg); + if (err) + goto out; header = buf; if (le64_to_cpu(header->id) > *max_id) *max_id = le64_to_cpu(header->id); } +out: mempool_free(buf, wb->buf_8_pool); - return r; + return err; } static int find_max_id(struct wb_device *wb, u64 *max_id) @@ -850,12 +901,12 @@ */ static int do_apply_valid_segments(struct wb_device *wb, u64 *max_id) { - int r = 0; + int err = 0; struct segment_header *seg; struct segment_header_device *header; u32 i, start_idx; - void *rambuf = vmalloc(1 << (SEGMENT_SIZE_ORDER + SECTOR_SHIFT)); + void *rambuf = vmalloc(1 << (SEGMENT_SIZE_ORDER + 9)); if (!rambuf) return -ENOMEM; @@ -871,8 +922,8 @@ div_u64_rem(i, wb->nr_segments, &k); seg = segment_at(wb, k); - r = read_whole_segment(rambuf, wb, seg); - if (r) + err = read_whole_segment(rambuf, wb, seg); + if (err) break; header = rambuf; @@ -900,12 +951,15 @@ } /* This segment is correct and we apply */ - apply_segment_header_device(wb, seg, header); + err = apply_segment_header_device(wb, seg, header); + if (err) + break; + *max_id = le64_to_cpu(header->id); } vfree(rambuf); - return r; + return err; } static int apply_valid_segments(struct wb_device *wb, u64 *max_id) @@ -923,17 +977,18 @@ static int infer_last_writeback_id(struct wb_device *wb) { - int r = 0; + int err = 0; + u64 inferred_last_writeback_id; u64 record_id; + struct superblock_record_device uninitialized_var(record); - r = read_superblock_record(&record, wb); - if (r) - return r; - - atomic64_set(&wb->last_writeback_segment_id, - atomic64_read(&wb->last_flushed_segment_id) > wb->nr_segments ? - atomic64_read(&wb->last_flushed_segment_id) - wb->nr_segments : 0); + err = read_superblock_record(&record, wb); + if (err) + return err; + + inferred_last_writeback_id = + SUB_ID(atomic64_read(&wb->last_flushed_segment_id), wb->nr_segments); /* * If last_writeback_id is recorded on the super block @@ -941,10 +996,15 @@ * written back before. 
*/ record_id = le64_to_cpu(record.last_writeback_segment_id); - if (record_id > atomic64_read(&wb->last_writeback_segment_id)) - atomic64_set(&wb->last_writeback_segment_id, record_id); + if (record_id > inferred_last_writeback_id) { + u64 id; + for (id = inferred_last_writeback_id + 1; id <= record_id; id++) + mark_clean_seg(wb, get_segment_header_by_id(wb, id)); + inferred_last_writeback_id = record_id; + } - return r; + atomic64_set(&wb->last_writeback_segment_id, inferred_last_writeback_id); + return err; } /* @@ -961,28 +1021,31 @@ */ static int replay_log_on_cache(struct wb_device *wb) { - int r = 0; + int err = 0; u64 max_id; - r = find_max_id(wb, &max_id); - if (r) { + err = find_max_id(wb, &max_id); + if (err) { DMERR("find_max_id failed"); - return r; + return err; } - r = apply_valid_segments(wb, &max_id); - if (r) { + err = apply_valid_segments(wb, &max_id); + if (err) { DMERR("apply_valid_segments failed"); - return r; + return err; } /* Setup last_flushed_segment_id */ atomic64_set(&wb->last_flushed_segment_id, max_id); + /* Setup last_queued_segment_id */ + atomic64_set(&wb->last_queued_segment_id, max_id); + /* Setup last_writeback_segment_id */ infer_last_writeback_id(wb); - return r; + return err; } /* @@ -1000,12 +1063,12 @@ */ static int recover_cache(struct wb_device *wb) { - int r = 0; + int err = 0; - r = replay_log_on_cache(wb); - if (r) { + err = replay_log_on_cache(wb); + if (err) { DMERR("replay_log_on_cache failed"); - return r; + return err; } prepare_first_seg(wb); @@ -1026,7 +1089,7 @@ if (!writeback_seg->ios) goto bad_ios; - writeback_seg->buf = vmalloc((1 << (SEGMENT_SIZE_ORDER + SECTOR_SHIFT)) - (1 << 12)); + writeback_seg->buf = vmalloc((1 << (SEGMENT_SIZE_ORDER + 9)) - (1 << 12)); if (!writeback_seg->buf) goto bad_buf; @@ -1073,7 +1136,7 @@ */ int try_alloc_writeback_ios(struct wb_device *wb, size_t nr_batch, gfp_t gfp) { - int r = 0; + int err = 0; size_t i; struct writeback_segment **writeback_segs = kzalloc( @@ -1104,9 +1167,9 @@ /* And then swap by new values */ wb->writeback_segs = writeback_segs; - wb->nr_cur_batched_writeback = nr_batch; + wb->nr_writeback_segs = nr_batch; - return r; + return err; } /*----------------------------------------------------------------------------*/ @@ -1116,7 +1179,7 @@ wb->name = kthread_create( \ name##_proc, wb, "dmwb_" #name); \ if (IS_ERR(wb->name)) { \ - r = PTR_ERR(wb->name); \ + err = PTR_ERR(wb->name); \ wb->name = NULL; \ DMERR("couldn't spawn " #name); \ goto bad_##name; \ @@ -1134,26 +1197,26 @@ */ static int init_metadata(struct wb_device *wb) { - int r = 0; + int err = 0; - r = init_segment_header_array(wb); - if (r) { + err = init_segment_header_array(wb); + if (err) { DMERR("init_segment_header_array failed"); goto bad_alloc_segment_header_array; } - r = ht_empty_init(wb); - if (r) { + err = ht_empty_init(wb); + if (err) { DMERR("ht_empty_init failed"); goto bad_alloc_ht; } - return r; + return err; bad_alloc_ht: free_segment_header_array(wb); bad_alloc_segment_header_array: - return r; + return err; } static void free_metadata(struct wb_device *wb) @@ -1164,13 +1227,13 @@ static int init_writeback_daemon(struct wb_device *wb) { - int r = 0; + int err = 0; size_t nr_batch; atomic_set(&wb->writeback_fail_count, 0); atomic_set(&wb->writeback_io_count, 0); - nr_batch = 8; + nr_batch = 32; wb->nr_max_batched_writeback = nr_batch; if (try_alloc_writeback_ios(wb, nr_batch, GFP_KERNEL)) return -ENOMEM; @@ -1184,23 +1247,22 @@ wb->force_drop = false; CREATE_DAEMON(writeback_daemon); - return r; + return err; 
bad_writeback_daemon: free_writeback_ios(wb); - return r; + return err; } -static int init_flusher(struct wb_device *wb) +static int init_flush_daemon(struct wb_device *wb) { - wb->flusher_wq = create_singlethread_workqueue("dmwb_flusher"); - if (!wb->flusher_wq) { - DMERR("Failed to allocate flusher_wq"); - return -ENOMEM; - } - + int err = 0; init_waitqueue_head(&wb->flush_wait_queue); - return 0; + CREATE_DAEMON(flush_daemon); + return err; + +bad_flush_daemon: + return err; } static int init_flush_barrier_work(struct wb_device *wb) @@ -1217,96 +1279,96 @@ static int init_writeback_modulator(struct wb_device *wb) { - int r = 0; + int err = 0; wb->writeback_threshold = 0; CREATE_DAEMON(writeback_modulator); - return r; + return err; bad_writeback_modulator: - return r; + return err; } static int init_sb_record_updater(struct wb_device *wb) { - int r = 0; + int err = 0; wb->update_sb_record_interval = 0; CREATE_DAEMON(sb_record_updater); - return r; + return err; bad_sb_record_updater: - return r; + return err; } static int init_data_synchronizer(struct wb_device *wb) { - int r = 0; + int err = 0; wb->sync_data_interval = 0; CREATE_DAEMON(data_synchronizer); - return r; + return err; bad_data_synchronizer: - return r; + return err; } int resume_cache(struct wb_device *wb) { - int r = 0; + int err = 0; wb->nr_segments = calc_nr_segments(wb->cache_dev, wb); wb->nr_caches_inseg = (1 << (SEGMENT_SIZE_ORDER - 3)) - 1; wb->nr_caches = wb->nr_segments * wb->nr_caches_inseg; - r = init_devices(wb); - if (r) + err = init_devices(wb); + if (err) goto bad_devices; - r = init_metadata(wb); - if (r) + err = init_metadata(wb); + if (err) goto bad_metadata; - r = init_writeback_daemon(wb); - if (r) { + err = init_writeback_daemon(wb); + if (err) { DMERR("init_writeback_daemon failed"); goto bad_writeback_daemon; } - r = recover_cache(wb); - if (r) { + err = recover_cache(wb); + if (err) { DMERR("recover_cache failed"); goto bad_recover; } - r = init_flusher(wb); - if (r) { - DMERR("init_flusher failed"); - goto bad_flusher; + err = init_flush_daemon(wb); + if (err) { + DMERR("init_flush_daemon failed"); + goto bad_flush_daemon; } - r = init_flush_barrier_work(wb); - if (r) { + err = init_flush_barrier_work(wb); + if (err) { DMERR("init_flush_barrier_work failed"); goto bad_flush_barrier_work; } - r = init_writeback_modulator(wb); - if (r) { + err = init_writeback_modulator(wb); + if (err) { DMERR("init_writeback_modulator failed"); goto bad_modulator; } - r = init_sb_record_updater(wb); - if (r) { + err = init_sb_record_updater(wb); + if (err) { DMERR("init_sb_recorder failed"); goto bad_updater; } - r = init_data_synchronizer(wb); - if (r) { + err = init_data_synchronizer(wb); + if (err) { DMERR("init_data_synchronizer failed"); goto bad_synchronizer; } - return r; + return err; bad_synchronizer: kthread_stop(wb->sb_record_updater); @@ -1315,8 +1377,8 @@ bad_modulator: destroy_workqueue(wb->barrier_wq); bad_flush_barrier_work: - destroy_workqueue(wb->flusher_wq); -bad_flusher: + kthread_stop(wb->flush_daemon); +bad_flush_daemon: bad_recover: kthread_stop(wb->writeback_daemon); free_writeback_ios(wb); @@ -1325,7 +1387,7 @@ bad_metadata: free_devices(wb); bad_devices: - return r; + return err; } void free_cache(struct wb_device *wb) @@ -1340,7 +1402,7 @@ destroy_workqueue(wb->barrier_wq); - destroy_workqueue(wb->flusher_wq); + kthread_stop(wb->flush_daemon); kthread_stop(wb->writeback_daemon); free_writeback_ios(wb); diff -Nru dm-writeboost-2.1.1/src/dm-writeboost-metadata.h 
dm-writeboost-2.2.6/src/dm-writeboost-metadata.h --- dm-writeboost-2.1.1/src/dm-writeboost-metadata.h 2016-01-04 13:25:39.000000000 +0000 +++ dm-writeboost-2.2.6/src/dm-writeboost-metadata.h 2016-09-19 06:15:04.000000000 +0000 @@ -24,6 +24,7 @@ struct segment_header * get_segment_header_by_id(struct wb_device *, u64 segment_id); +struct rambuffer *get_rambuffer_by_id(struct wb_device *wb, u64 id); sector_t calc_mb_start_sector(struct wb_device *, struct segment_header *, u32 mb_idx); u8 mb_idx_inseg(struct wb_device *, u32 mb_idx); diff -Nru dm-writeboost-2.1.1/src/dm-writeboost-target.c dm-writeboost-2.2.6/src/dm-writeboost-target.c --- dm-writeboost-2.1.1/src/dm-writeboost-target.c 2016-01-04 13:25:39.000000000 +0000 +++ dm-writeboost-2.2.6/src/dm-writeboost-target.c 2016-09-19 06:15:04.000000000 +0000 @@ -28,28 +28,18 @@ /*----------------------------------------------------------------------------*/ -void bio_endio_compat(struct bio *bio, int error) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,3,0) - bio->bi_error = error; - bio_endio(bio); -#else - bio_endio(bio, error); -#endif -} - -/*----------------------------------------------------------------------------*/ - void do_check_buffer_alignment(void *buf, const char *name, const char *caller) { unsigned long addr = (unsigned long) buf; - if (!IS_ALIGNED(addr, 1 << SECTOR_SHIFT)) { + if (!IS_ALIGNED(addr, 1 << 9)) { DMCRIT("@%s in %s is not sector-aligned. I/O buffer must be sector-aligned.", name, caller); BUG(); } } +/*----------------------------------------------------------------------------*/ + struct wb_io { struct work_struct work; int err; @@ -78,7 +68,7 @@ .regions = regions, .num_regions = num_regions, }; - BUG_ON(io_req->notify.fn); + ASSERT(io_req->notify.fn == NULL); INIT_WORK_ONSTACK(&io.work, wb_io_fn); queue_work(wb->io_wq, &io.work); @@ -92,9 +82,7 @@ err = dm_io(io_req, num_regions, regions, err_bits); } - /* - * err_bits can be NULL. - */ + /* err_bits can be NULL. */ if (err || (err_bits && *err_bits)) { char buf[BDEVNAME_SIZE]; dev_t dev = regions->bdev->bd_dev; @@ -106,9 +94,10 @@ eb = *err_bits; format_dev_t(buf, dev); - DMERR("%s() I/O error(%d), bits(%lu), dev(%s), sector(%llu), rw(%d)", + DMERR("%s() I/O error(%d), bits(%lu), dev(%s), sector(%llu), %s", caller, err, eb, - buf, (unsigned long long) regions->sector, io_req->bi_rw); + buf, (unsigned long long) regions->sector, + req_is_write(io_req) ? 
"write" : "read"); } return err; @@ -116,17 +105,25 @@ sector_t dm_devsize(struct dm_dev *dev) { - return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; + return i_size_read(dev->bdev->bd_inode) >> 9; } /*----------------------------------------------------------------------------*/ -#if LINUX_VERSION_CODE <= KERNEL_VERSION(3,14,0) -#define bi_sector(bio) (bio)->bi_sector -#define bi_size(bio) (bio)->bi_size +void bio_endio_compat(struct bio *bio, int error) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,3,0) + bio->bi_error = error; + bio_endio(bio); #else + bio_endio(bio, error); +#endif +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) #define bi_sector(bio) (bio)->bi_iter.bi_sector -#define bi_size(bio) (bio)->bi_iter.bi_size +#else +#define bi_sector(bio) (bio)->bi_sector #endif static void bio_remap(struct bio *bio, struct dm_dev *dev, sector_t sector) @@ -135,24 +132,24 @@ bi_sector(bio) = sector; } -static u8 do_io_offset(sector_t sector) +static u8 calc_offset(sector_t sector) { u32 tmp32; div_u64_rem(sector, 1 << 3, &tmp32); return tmp32; } -static u8 io_offset(struct bio *bio) +static u8 bio_calc_offset(struct bio *bio) { - return do_io_offset(bi_sector(bio)); + return calc_offset(bi_sector(bio)); } -static bool io_fullsize(struct bio *bio) +static bool bio_is_fullsize(struct bio *bio) { return bio_sectors(bio) == (1 << 3); } -static bool io_write(struct bio *bio) +static bool bio_is_write(struct bio *bio) { return bio_data_dir(bio) == WRITE; } @@ -193,14 +190,114 @@ return count; } +void inc_nr_dirty_caches(struct wb_device *wb) +{ + ASSERT(wb); + atomic64_inc(&wb->nr_dirty_caches); +} + +void dec_nr_dirty_caches(struct wb_device *wb) +{ + ASSERT(wb); + if (atomic64_dec_and_test(&wb->nr_dirty_caches)) + wake_up_interruptible(&wb->wait_drop_caches); +} + +static bool taint_mb(struct wb_device *wb, struct metablock *mb, u8 data_bits) +{ + unsigned long flags; + bool flipped = false; + + ASSERT(data_bits > 0); + spin_lock_irqsave(&wb->mb_lock, flags); + if (!mb->dirtiness.is_dirty) { + mb->dirtiness.is_dirty = true; + flipped = true; + } + mb->dirtiness.data_bits |= data_bits; + spin_unlock_irqrestore(&wb->mb_lock, flags); + + return flipped; +} + +bool mark_clean_mb(struct wb_device *wb, struct metablock *mb) +{ + unsigned long flags; + bool flipped = false; + + spin_lock_irqsave(&wb->mb_lock, flags); + if (mb->dirtiness.is_dirty) { + mb->dirtiness.is_dirty = false; + flipped = true; + } + spin_unlock_irqrestore(&wb->mb_lock, flags); + + return flipped; +} + +/* + * Read the dirtiness of a metablock at the moment. + */ +struct dirtiness read_mb_dirtiness(struct wb_device *wb, struct segment_header *seg, + struct metablock *mb) +{ + unsigned long flags; + struct dirtiness retval; + + spin_lock_irqsave(&wb->mb_lock, flags); + retval = mb->dirtiness; + spin_unlock_irqrestore(&wb->mb_lock, flags); + + return retval; +} + +/*----------------------------------------------------------------------------*/ + +void cursor_init(struct wb_device *wb) +{ + wb->cursor = wb->current_seg->start_idx; + wb->current_seg->length = 0; +} + /* - * Prepare the RAM buffer for segment write. + * Advance the cursor and return the old cursor. + * After returned, nr_inflight_ios is incremented to wait for this write to complete. 
*/ +static u32 advance_cursor(struct wb_device *wb) +{ + u32 old; + if (wb->cursor == wb->nr_caches) + wb->cursor = 0; + old = wb->cursor; + wb->cursor++; + wb->current_seg->length++; + BUG_ON(wb->current_seg->length > wb->nr_caches_inseg); + atomic_inc(&wb->current_seg->nr_inflight_ios); + return old; +} + +static bool needs_queue_seg(struct wb_device *wb) +{ + bool rambuf_no_space = !mb_idx_inseg(wb, wb->cursor); + return rambuf_no_space; +} + +/*----------------------------------------------------------------------------*/ + +static void copy_barrier_requests(struct rambuffer *rambuf, struct wb_device *wb) +{ + bio_list_init(&rambuf->barrier_ios); + bio_list_merge(&rambuf->barrier_ios, &wb->barrier_ios); + bio_list_init(&wb->barrier_ios); +} + static void prepare_rambuffer(struct rambuffer *rambuf, struct wb_device *wb, struct segment_header *seg) { + rambuf->seg = seg; prepare_segment_header_device(rambuf->data, wb, seg); + copy_barrier_requests(rambuf, wb); } static void init_rambuffer(struct wb_device *wb) @@ -211,27 +308,16 @@ /* * Acquire a new RAM buffer for the new segment. */ -static void acquire_new_rambuffer(struct wb_device *wb, u64 id) +static void __acquire_new_rambuffer(struct wb_device *wb, u64 id) { - struct rambuffer *next_rambuf; - u32 tmp32; - wait_for_flushing(wb, SUB_ID(id, NR_RAMBUF_POOL)); - div_u64_rem(id - 1, NR_RAMBUF_POOL, &tmp32); - next_rambuf = wb->rambuf_pool + tmp32; - - wb->current_rambuf = next_rambuf; + wb->current_rambuf = get_rambuffer_by_id(wb, id); init_rambuffer(wb); } -/* - * Acquire the new segment and RAM buffer for the following writes. - * Guarantees all dirty caches in the segments are written back and - * all metablocks in it are invalidated (Linked to null head). - */ -void acquire_new_seg(struct wb_device *wb, u64 id) +static void __acquire_new_seg(struct wb_device *wb, u64 id) { struct segment_header *new_seg = get_segment_header_by_id(wb, id); @@ -256,8 +342,17 @@ */ new_seg->id = id; wb->current_seg = new_seg; +} - acquire_new_rambuffer(wb, id); +/* + * Acquire the new segment and RAM buffer for the following writes. + * Guarantees all dirty caches in the segments are written back and + * all metablocks in it are invalidated (Linked to null head). 
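 * As the code above suggests, the RAM buffers form a small ring of
 * NR_RAMBUF_POOL entries: __acquire_new_rambuffer() waits until the
 * segment that used the same slot (id - NR_RAMBUF_POOL) has been flushed
 * before handing the buffer out again, so a buffer should never be
 * overwritten while its previous contents are still queued for flushing.
 * (Illustration, assuming NR_RAMBUF_POOL == 8: acquiring the buffer for
 * segment 20 first waits for segment 12 to be flushed.)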
+ */ +void acquire_new_seg(struct wb_device *wb, u64 id) +{ + __acquire_new_rambuffer(wb, id); + __acquire_new_seg(wb, id); } static void prepare_new_seg(struct wb_device *wb) @@ -269,32 +364,15 @@ /*----------------------------------------------------------------------------*/ -static void copy_barrier_requests(struct flush_job *job, struct wb_device *wb) -{ - bio_list_init(&job->barrier_ios); - bio_list_merge(&job->barrier_ios, &wb->barrier_ios); - bio_list_init(&wb->barrier_ios); -} - -static void init_flush_job(struct flush_job *job, struct wb_device *wb) -{ - job->wb = wb; - job->seg = wb->current_seg; - - copy_barrier_requests(job, wb); -} - static void queue_flush_job(struct wb_device *wb) { - struct flush_job *job = &wb->current_rambuf->job; - wait_event(wb->inflight_ios_wq, !atomic_read(&wb->current_seg->nr_inflight_ios)); prepare_rambuffer(wb->current_rambuf, wb, wb->current_seg); - init_flush_job(job, wb); - INIT_WORK(&job->work, flush_proc); - queue_work(wb->flusher_wq, &job->work); + smp_wmb(); + atomic64_inc(&wb->last_queued_segment_id); + wake_up_process(wb->flush_daemon); } static void queue_current_buffer(struct wb_device *wb) @@ -303,10 +381,15 @@ prepare_new_seg(wb); } -void cursor_init(struct wb_device *wb) +/* + * queue_current_buffer if the RAM buffer can't make space any more. + */ +static void might_queue_current_buffer(struct wb_device *wb) { - wb->cursor = wb->current_seg->start_idx; - wb->current_seg->length = 0; + if (needs_queue_seg(wb)) { + update_nr_empty_segs(wb); + queue_current_buffer(wb); + } } /* @@ -320,8 +403,6 @@ old_seg = wb->current_seg; queue_current_buffer(wb); - - cursor_init(wb); /* FIXME this looks dup call */ mutex_unlock(&wb->io_lock); wait_for_flushing(wb, old_seg->id); @@ -360,458 +441,750 @@ /*----------------------------------------------------------------------------*/ -void inc_nr_dirty_caches(struct wb_device *wb) -{ - BUG_ON(!wb); - atomic64_inc(&wb->nr_dirty_caches); -} - -void dec_nr_dirty_caches(struct wb_device *wb) -{ - BUG_ON(!wb); - if (atomic64_dec_and_test(&wb->nr_dirty_caches)) - wake_up_interruptible(&wb->wait_drop_caches); -} +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) +#define bv_vec struct bio_vec +#define bv_page(vec) vec.bv_page +#define bv_offset(vec) vec.bv_offset +#define bv_len(vec) vec.bv_len +#define bv_it struct bvec_iter +#else +#define bv_vec struct bio_vec * +#define bv_page(vec) vec->bv_page +#define bv_offset(vec) vec->bv_offset +#define bv_len(vec) vec->bv_len +#define bv_it int +#endif -static bool taint_mb(struct wb_device *wb, struct metablock *mb, struct bio *bio) +/* + * Incoming bio may have multiple bio vecs as a result bvec merging. + * We shouldn't use bio_data directly to access to whole payload but + * should iterate over the vector. 
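 * Illustrative example (hypothetical layout, not from this diff): a 4KB
 * write may arrive as two bio vecs, e.g. 1KB in one page and 3KB in
 * another; bio_for_each_segment() visits both, each page is mapped with
 * kmap_atomic(), its bv_len bytes are copied, and the destination pointer
 * advances, so the ASSERT that the copied total equals
 * bio_sectors(bio) << 9 holds for any vector layout.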
+ */ +static void copy_bio_payload(void *buf, struct bio *bio) { - unsigned long flags; - bool flip = false; - - spin_lock_irqsave(&wb->mb_lock, flags); - if (!mb->dirtiness.is_dirty) { - mb->dirtiness.is_dirty = true; - flip = true; - } - - if (likely(io_fullsize(bio))) { - mb->dirtiness.data_bits = 255; - } else { - u8 i; - u8 acc_bits = 0; - for (i = io_offset(bio); i < (io_offset(bio) + bio_sectors(bio)); i++) - acc_bits += (1 << i); - - mb->dirtiness.data_bits |= acc_bits; + size_t sum = 0; + bv_vec vec; + bv_it it; + bio_for_each_segment(vec, bio, it) { + void *dst = kmap_atomic(bv_page(vec)); + size_t l = bv_len(vec); + memcpy(buf, dst + bv_offset(vec), l); + kunmap_atomic(dst); + buf += l; + sum += l; } - - BUG_ON(!bio_sectors(bio)); - BUG_ON(!mb->dirtiness.data_bits); - spin_unlock_irqrestore(&wb->mb_lock, flags); - - return flip; + ASSERT(sum == (bio_sectors(bio) << 9)); } -bool mark_clean_mb(struct wb_device *wb, struct metablock *mb) +/* + * Copy 512B buffer data to bio payload's i-th 512B area. + */ +static void __copy_to_bio_payload(struct bio *bio, void *buf, u8 i) { - unsigned long flags; - bool flip = false; + size_t head = 0; + size_t tail = head; - spin_lock_irqsave(&wb->mb_lock, flags); - if (mb->dirtiness.is_dirty) { - mb->dirtiness.is_dirty = false; - flip = true; + bv_vec vec; + bv_it it; + bio_for_each_segment(vec, bio, it) { + size_t l = bv_len(vec); + tail += l; + if ((i << 9) < tail) { + void *dst = kmap_atomic(bv_page(vec)); + size_t offset = (i << 9) - head; + BUG_ON((l - offset) < (1 << 9)); + memcpy(dst + bv_offset(vec) + offset, buf, 1 << 9); + kunmap_atomic(dst); + return; + } + head += l; } - spin_unlock_irqrestore(&wb->mb_lock, flags); - - return flip; + BUG(); } /* - * Read the dirtiness of a metablock at the moment. + * Copy 4KB buffer to bio payload with care to bio offset and copy bits. */ -struct dirtiness read_mb_dirtiness(struct wb_device *wb, struct segment_header *seg, - struct metablock *mb) +static void copy_to_bio_payload(struct bio *bio, void *buf, u8 copy_bits) { - unsigned long flags; - struct dirtiness retval; - - spin_lock_irqsave(&wb->mb_lock, flags); - retval = mb->dirtiness; - spin_unlock_irqrestore(&wb->mb_lock, flags); - - return retval; + u8 offset = bio_calc_offset(bio); + u8 i; + for (i = 0; i < bio_sectors(bio); i++) { + u8 i_offset = i + offset; + if (copy_bits & (1 << i_offset)) + __copy_to_bio_payload(bio, buf + (i_offset << 9), i); + } } /*----------------------------------------------------------------------------*/ -struct writeback_mb_context { - struct wb_device *wb; - atomic_t count; - int err; -}; - -static void writeback_mb_complete(int read_err, unsigned long write_err, void *__context) -{ - struct writeback_mb_context *context = __context; +struct lookup_result { + struct ht_head *head; /* Lookup head used */ + struct lookup_key key; /* Lookup key used */ - if (read_err || write_err) - context->err = 1; + struct segment_header *found_seg; + struct metablock *found_mb; - if (atomic_dec_and_test(&context->count)) - wake_up_active_wq(&context->wb->writeback_mb_wait_queue); -} + bool found; /* Cache hit? */ + bool on_buffer; /* Is the metablock found on the RAM buffer? */ +}; /* - * Write back a cache from cache device to the backing device. - * We don't need to make the data written back persistent because this segment - * will be reused only after writeback daemon wrote this segment back. + * Lookup a bio relevant cache data. + * In case of cache hit, nr_inflight_ios is incremented. 
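 * The lookup key is the 4KB-aligned sector of the bio
 * (calc_cache_alignment() presumably rounds down to the 8-sector block),
 * so, for example, bios starting at sector 1000 and at sector 1003 both
 * resolve to the metablock keyed by sector 1000. res->on_buffer then tells
 * whether that metablock still lives in the RAM buffer or has already been
 * flushed to the caching device.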
*/ -static void writeback_mb(struct wb_device *wb, struct segment_header *seg, - struct metablock *mb, u8 data_bits, bool thread) +static void cache_lookup(struct wb_device *wb, struct bio *bio, struct lookup_result *res) { - int r = 0; + res->key = (struct lookup_key) { + .sector = calc_cache_alignment(bi_sector(bio)), + }; + res->head = ht_get_head(wb, &res->key); - struct writeback_mb_context context; - context.wb = wb; - context.err = 0; + res->found_mb = ht_lookup(wb, res->head, &res->key); + if (res->found_mb) { + res->found_seg = mb_to_seg(wb, res->found_mb); + atomic_inc(&res->found_seg->nr_inflight_ios); + } - BUG_ON(!data_bits); + res->found = (res->found_mb != NULL); - if (data_bits == 255) { - struct dm_io_region src, dest; + res->on_buffer = false; + if (res->found) + res->on_buffer = is_on_buffer(wb, res->found_mb->idx); - atomic_set(&context.count, 1); + inc_stat(wb, bio_is_write(bio), res->found, res->on_buffer, bio_is_fullsize(bio)); +} - src = (struct dm_io_region) { - .bdev = wb->cache_dev->bdev, - .sector = calc_mb_start_sector(wb, seg, mb->idx), - .count = (1 << 3), - }; - dest = (struct dm_io_region) { - .bdev = wb->backing_dev->bdev, - .sector = mb->sector, - .count = (1 << 3), - }; - maybe_IO(dm_kcopyd_copy(wb->copier, &src, 1, &dest, 0, writeback_mb_complete, &context)); - if (r) - writeback_mb_complete(0, 0, &context); - } else { - u8 i; +static void dec_inflight_ios(struct wb_device *wb, struct segment_header *seg) +{ + if (atomic_dec_and_test(&seg->nr_inflight_ios)) + wake_up_active_wq(&wb->inflight_ios_wq); +} - u8 count = 0; - for (i = 0; i < 8; i++) - if (data_bits & (1 << i)) - count++; - - atomic_set(&context.count, count); - - for (i = 0; i < 8; i++) { - struct dm_io_region src, dest; - - if (!(data_bits & (1 << i))) - continue; - - src = (struct dm_io_region) { - .bdev = wb->cache_dev->bdev, - .sector = calc_mb_start_sector(wb, seg, mb->idx) + i, - .count = 1, - }; - dest = (struct dm_io_region) { - .bdev = wb->backing_dev->bdev, - .sector = mb->sector + i, - .count = 1, - }; - maybe_IO(dm_kcopyd_copy(wb->copier, &src, 1, &dest, 0, writeback_mb_complete, &context)); - if (r) - writeback_mb_complete(0, 0, &context); - } +/*----------------------------------------------------------------------------*/ + +static u8 to_mask(u8 offset, u8 count) +{ + u8 i; + u8 result = 0; + if (count == 8) { + result = 255; + } else { + for (i = 0; i < count; i++) + result |= (1 << (i + offset)); } + return result; +} - wait_event(wb->writeback_mb_wait_queue, !atomic_read(&context.count)); - if (context.err) - mark_dead(wb); +static int fill_payload_by_backing(struct wb_device *wb, struct bio *bio) +{ + struct dm_io_request io_req; + struct dm_io_region region; + + sector_t start = bi_sector(bio); + u8 offset = calc_offset(start); + u8 len = bio_sectors(bio); + u8 copy_bits = to_mask(offset, len); + + int err = 0; + void *buf = mempool_alloc(wb->buf_8_pool, GFP_NOIO); + if (!buf) + return -ENOMEM; + + io_req = (struct dm_io_request) { + WB_IO_READ, + .client = wb->io_client, + .notify.fn = NULL, + .mem.type = DM_IO_KMEM, + .mem.ptr.addr = buf + (offset << 9), + }; + region = (struct dm_io_region) { + .bdev = wb->backing_dev->bdev, + .sector = start, + .count = len, + }; + err = wb_io(&io_req, 1, ®ion, NULL, true); + if (err) + goto bad; + + copy_to_bio_payload(bio, buf, copy_bits); +bad: + mempool_free(buf, wb->buf_8_pool); + return err; } /* - * Write back a cache on the RAM buffer to backing device. 
- * Calling this function is really rare so the code needs not to be optimal. - * There is no need to write them back with FUA flag because the cache isn't - * flushed yet and thus isn't persistent. + * Get the reference to the 4KB-aligned data in RAM buffer. + * Since it only takes the reference caller need not to free the pointer. */ -static void writeback_buffered_mb(struct wb_device *wb, struct metablock *mb, u8 data_bits) +static void *ref_buffered_mb(struct wb_device *wb, struct metablock *mb) { - int r = 0; - sector_t offset = ((mb_idx_inseg(wb, mb->idx) + 1) << 3); - void *buf = mempool_alloc(wb->buf_1_pool, GFP_NOIO); + return wb->current_rambuf->data + (offset << 9); +} +/* + * Read cache block of the mb. + * Caller should free the returned pointer after used by mempool_alloc(). + */ +static void *read_mb(struct wb_device *wb, struct segment_header *seg, + struct metablock *mb, u8 data_bits) +{ u8 i; + void *result = mempool_alloc(wb->buf_8_pool, GFP_NOIO); + if (!result) + return NULL; + for (i = 0; i < 8; i++) { + int err = 0; struct dm_io_request io_req; struct dm_io_region region; - void *src; - sector_t dest; - if (!(data_bits & (1 << i))) continue; - src = wb->current_rambuf->data + ((offset + i) << SECTOR_SHIFT); - dest = mb->sector + i; - - memcpy(buf, src, 1 << SECTOR_SHIFT); io_req = (struct dm_io_request) { + WB_IO_READ, .client = wb->io_client, - .bi_rw = WRITE, .notify.fn = NULL, .mem.type = DM_IO_KMEM, - .mem.ptr.addr = buf, + .mem.ptr.addr = result + (i << 9), }; + region = (struct dm_io_region) { - .bdev = wb->backing_dev->bdev, - .sector = dest, + .bdev = wb->cache_dev->bdev, + .sector = calc_mb_start_sector(wb, seg, mb->idx) + i, .count = 1, }; - maybe_IO(wb_io(&io_req, 1, ®ion, NULL, true)); + + err = wb_io(&io_req, 1, ®ion, NULL, true); + if (err) { + mempool_free(result, wb->buf_8_pool); + return NULL; + } } - mempool_free(buf, wb->buf_1_pool); + return result; } -void prepare_overwrite(struct wb_device *wb, struct segment_header *seg, struct metablock *old_mb, bool overwrite_fullsize) +/*----------------------------------------------------------------------------*/ + +enum PBD_FLAG { + PBD_NONE = 0, + PBD_WILL_CACHE = 1, + PBD_READ_SEG = 2, +}; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,6,0) +#define PER_BIO_DATA_SIZE per_io_data_size +#else +#define PER_BIO_DATA_SIZE per_bio_data_size +#endif +struct per_bio_data { + enum PBD_FLAG type; + union { + u32 cell_idx; + struct segment_header *seg; + }; +}; +#define per_bio_data(wb, bio) ((struct per_bio_data *)dm_per_bio_data((bio), (wb)->ti->PER_BIO_DATA_SIZE)) + +/*----------------------------------------------------------------------------*/ + +#define read_cache_cell_from_node(node) rb_entry((node), struct read_cache_cell, rb_node) + +static void read_cache_add(struct read_cache_cells *cells, struct read_cache_cell *cell) { - struct dirtiness dirtiness = read_mb_dirtiness(wb, seg, old_mb); + struct rb_node **rbp, *parent; + rbp = &cells->rb_root.rb_node; + parent = NULL; + while (*rbp) { + struct read_cache_cell *parent_cell; + parent = *rbp; + parent_cell = read_cache_cell_from_node(parent); + if (cell->sector < parent_cell->sector) + rbp = &(*rbp)->rb_left; + else + rbp = &(*rbp)->rb_right; + } + rb_link_node(&cell->rb_node, parent, rbp); + rb_insert_color(&cell->rb_node, &cells->rb_root); +} + +static struct read_cache_cell *lookup_read_cache_cell(struct wb_device *wb, sector_t sector) +{ + struct rb_node **rbp, *parent; + rbp = &wb->read_cache_cells->rb_root.rb_node; + parent = NULL; + while (*rbp) { + 
struct read_cache_cell *parent_cell; + parent = *rbp; + parent_cell = read_cache_cell_from_node(parent); + if (parent_cell->sector == sector) + return parent_cell; + + if (sector < parent_cell->sector) + rbp = &(*rbp)->rb_left; + else + rbp = &(*rbp)->rb_right; + } + return NULL; +} + +static void read_cache_cancel_cells(struct read_cache_cells *cells, u32 n) +{ + u32 i; + u32 last = cells->cursor + cells->seqcount; + if (last > cells->size) + last = cells->size; + for (i = cells->cursor; i < last; i++) { + struct read_cache_cell *cell = cells->array + i; + cell->cancelled = true; + } +} + +/* + * Track the forefront read address and cancel cells in case of over threshold. + * If the cell is cancelled foreground, we can save the memory copy in the background. + */ +static void read_cache_cancel_foreground(struct read_cache_cells *cells, + struct read_cache_cell *new_cell) +{ + if (new_cell->sector == (cells->last_sector + 8)) + cells->seqcount++; + else { + cells->seqcount = 1; + cells->over_threshold = false; + } + + if (cells->seqcount > cells->threshold) { + if (cells->over_threshold) + new_cell->cancelled = true; + else { + cells->over_threshold = true; + read_cache_cancel_cells(cells, cells->seqcount); + } + } + cells->last_sector = new_cell->sector; +} + +static bool reserve_read_cache_cell(struct wb_device *wb, struct bio *bio) +{ + struct per_bio_data *pbd; + struct read_cache_cells *cells = wb->read_cache_cells; + struct read_cache_cell *found, *new_cell; + + ASSERT(cells->threshold > 0); + + if (!ACCESS_ONCE(wb->read_cache_threshold)) + return false; + + if (!cells->cursor) + return false; /* - * First clean up the previous cache and write back the cache if needed. + * We only cache 4KB read data for following reasons: + * 1) Caching partial data (< 4KB) is likely meaningless. + * 2) Caching partial data makes the read-caching mechanism very hard. */ - bool needs_writeback_prev_cache = !overwrite_fullsize || !(dirtiness.data_bits == 255); + if (!bio_is_fullsize(bio)) + return false; /* - * Writeback works in background and may have cleaned up the metablock. - * If the metablock is clean we don't have to write back. + * We don't need to reserve the same address twice + * because it's either unchanged or invalidated. 
*/ - if (!dirtiness.is_dirty) - needs_writeback_prev_cache = false; + found = lookup_read_cache_cell(wb, bi_sector(bio)); + if (found) + return false; - if (overwrite_fullsize) - needs_writeback_prev_cache = false; + cells->cursor--; + new_cell = cells->array + cells->cursor; + new_cell->sector = bi_sector(bio); + read_cache_add(cells, new_cell); - if (unlikely(needs_writeback_prev_cache)) { - wait_for_flushing(wb, seg->id); - BUG_ON(!dirtiness.is_dirty); - writeback_mb(wb, seg, old_mb, dirtiness.data_bits, true); - } + pbd = per_bio_data(wb, bio); + pbd->type = PBD_WILL_CACHE; + pbd->cell_idx = cells->cursor; - if (mark_clean_mb(wb, old_mb)) - dec_nr_dirty_caches(wb); + /* Cancel the new_cell if needed */ + read_cache_cancel_foreground(cells, new_cell); - ht_del(wb, old_mb); + return true; } -/*----------------------------------------------------------------------------*/ +static void might_cancel_read_cache_cell(struct wb_device *wb, struct bio *bio) +{ + struct read_cache_cell *found; + found = lookup_read_cache_cell(wb, calc_cache_alignment(bi_sector(bio))); + if (found) + found->cancelled = true; +} -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) -#define bv_vec struct bio_vec -#define bv_page(vec) vec.bv_page -#define bv_offset(vec) vec.bv_offset -#define bv_len(vec) vec.bv_len -#define bv_it struct bvec_iter -#else -#define bv_vec struct bio_vec * -#define bv_page(vec) vec->bv_page -#define bv_offset(vec) vec->bv_offset -#define bv_len(vec) vec->bv_len -#define bv_it int -#endif +static void read_cache_cell_copy_data(struct wb_device *wb, struct bio *bio, unsigned long error) +{ + struct per_bio_data *pbd = per_bio_data(wb, bio); + struct read_cache_cells *cells = wb->read_cache_cells; + struct read_cache_cell *cell = cells->array + pbd->cell_idx; + + ASSERT(pbd->type == PBD_WILL_CACHE); + + /* Data can be broken. So don't stage. */ + if (error) + cell->cancelled = true; + + /* + * We can omit copying if the cell is cancelled but + * copying for a non-cancelled cell isn't problematic. + */ + if (!cell->cancelled) + copy_bio_payload(cell->data, bio); + + if (atomic_dec_and_test(&cells->ack_count)) + queue_work(cells->wq, &wb->read_cache_work); +} /* - * Incoming bio may have multiple bio vecs as a result bvec merging. - * We shouldn't use bio_data directly to access to whole payload but - * should iterate over the vector. + * Get a read cache cell through simplified write path if the cell data isn't stale. */ -static void copy_bio_payload(void *buf, struct bio *bio) +static void inject_read_cache(struct wb_device *wb, struct read_cache_cell *cell) { - bv_vec vec; - bv_it it; - bio_for_each_segment(vec, bio, it) { - size_t l = bv_len(vec); - memcpy(buf, page_address(bv_page(vec)) + bv_offset(vec), l); - buf += l; + struct metablock *mb; + u32 _mb_idx_inseg; + struct segment_header *seg; + + struct lookup_key key = { + .sector = cell->sector, + }; + struct ht_head *head = ht_get_head(wb, &key); + + mutex_lock(&wb->io_lock); + /* + * if might_cancel_read_cache_cell() on the foreground + * cancelled this cell, the data is now stale. + */ + if (cell->cancelled) { + mutex_unlock(&wb->io_lock); + return; } + + might_queue_current_buffer(wb); + + seg = wb->current_seg; + _mb_idx_inseg = mb_idx_inseg(wb, advance_cursor(wb)); + + /* + * We should copy the cell data into the rambuf with lock held + * otherwise subsequent write data may be written first and then overwritten by + * the old data in the cell. 
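 * The byte offset (_mb_idx_inseg + 1) << 12 below places the 4KB block at
 * slot _mb_idx_inseg of the RAM buffer, apparently leaving the first 4KB
 * for the segment header that prepare_segment_header_device() writes at
 * rambuf->data. Illustration (hypothetical index): for _mb_idx_inseg == 2
 * the data lands at byte offset 3 << 12 == 12288 of the buffer, the same
 * layout write_on_rambuffer() uses for foreground writes.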
+ */ + memcpy(wb->current_rambuf->data + ((_mb_idx_inseg + 1) << 12), cell->data, 1 << 12); + + mb = seg->mb_array + _mb_idx_inseg; + ASSERT(!mb->dirtiness.is_dirty); + mb->dirtiness.data_bits = 255; + + ht_register(wb, head, mb, &key); + + mutex_unlock(&wb->io_lock); + + dec_inflight_ios(wb, seg); } -static void write_on_rambuffer(struct wb_device *wb, struct metablock *write_pos, struct bio *bio) +static void free_read_cache_cell_data(struct read_cache_cells *cells) { - sector_t start_sector = ((mb_idx_inseg(wb, write_pos->idx) + 1) << 3) + io_offset(bio); - size_t start_byte = start_sector << SECTOR_SHIFT; - copy_bio_payload(wb->current_rambuf->data + start_byte, bio); + u32 i; + for (i = 0; i < cells->size; i++) { + struct read_cache_cell *cell = cells->array + i; + vfree(cell->data); + } } -/* - * Advance the cursor and return the old cursor. - * After returned, nr_inflight_ios is incremented to wait for this write to complete. - */ -static u32 advance_cursor(struct wb_device *wb) +static struct read_cache_cells *alloc_read_cache_cells(struct wb_device *wb, u32 n) { - u32 old; - if (wb->cursor == wb->nr_caches) - wb->cursor = 0; - old = wb->cursor; - wb->cursor++; - wb->current_seg->length++; - BUG_ON(wb->current_seg->length > wb->nr_caches_inseg); - atomic_inc(&wb->current_seg->nr_inflight_ios); - return old; + struct read_cache_cells *cells; + u32 i; + cells = kmalloc(sizeof(struct read_cache_cells), GFP_KERNEL); + if (!cells) + return NULL; + + cells->size = n; + cells->threshold = UINT_MAX; /* Default: every read will be cached */ + cells->last_sector = ~0; + cells->seqcount = 0; + cells->over_threshold = false; + cells->array = kmalloc(sizeof(struct read_cache_cell) * n, GFP_KERNEL); + if (!cells->array) + goto bad_cells_array; + + for (i = 0; i < cells->size; i++) { + struct read_cache_cell *cell = cells->array + i; + cell->data = vmalloc(1 << 12); + if (!cell->data) { + u32 j; + for (j = 0; j < i; j++) { + cell = cells->array + j; + vfree(cell->data); + } + goto bad_cell_data; + } + } + + cells->wq = create_singlethread_workqueue("dmwb_read_cache"); + if (!cells->wq) + goto bad_wq; + + return cells; + +bad_wq: + free_read_cache_cell_data(cells); +bad_cell_data: + kfree(cells->array); +bad_cells_array: + kfree(cells); + return NULL; } -static bool needs_queue_seg(struct wb_device *wb) +static void free_read_cache_cells(struct wb_device *wb) { - bool rambuf_no_space = !mb_idx_inseg(wb, wb->cursor); - return rambuf_no_space; + struct read_cache_cells *cells = wb->read_cache_cells; + destroy_workqueue(cells->wq); /* This drains wq. So, must precede the others */ + free_read_cache_cell_data(cells); + kfree(cells->array); + kfree(cells); } -/* - * queue_current_buffer if the RAM buffer can't make space any more. 
- */ -static void might_queue_current_buffer(struct wb_device *wb) +static void reinit_read_cache_cells(struct wb_device *wb) { - if (needs_queue_seg(wb)) - queue_current_buffer(wb); + struct read_cache_cells *cells = wb->read_cache_cells; + u32 i, cur_threshold; + + mutex_lock(&wb->io_lock); + cells->rb_root = RB_ROOT; + cells->cursor = cells->size; + atomic_set(&cells->ack_count, cells->size); + for (i = 0; i < cells->size; i++) { + struct read_cache_cell *cell = cells->array + i; + cell->cancelled = false; + } + cur_threshold = ACCESS_ONCE(wb->read_cache_threshold); + if (cur_threshold && (cur_threshold != cells->threshold)) { + cells->threshold = cur_threshold; + cells->over_threshold = false; + } + mutex_unlock(&wb->io_lock); } /* - * Process bio with REQ_DISCARD - * We only discard sectors on only the backing store because blocks on cache - * device are unlikely to be discarded. As discarding blocks is likely to be - * operated long after writing the block is likely to be written back before that. + * Cancel cells [first, last) */ -static int process_discard_bio(struct wb_device *wb, struct bio *bio) +static void visit_and_cancel_cells(struct rb_node *first, struct rb_node *last) { - bio_remap(bio, wb->backing_dev, bi_sector(bio)); - return DM_MAPIO_REMAPPED; + struct rb_node *rbp = first; + while (rbp != last) { + struct read_cache_cell *cell = read_cache_cell_from_node(rbp); + cell->cancelled = true; + rbp = rb_next(rbp); + } } /* - * Process bio with REQ_FLUSH + * Find out sequence from cells and cancel them if larger than threshold. */ -static int process_flush_bio(struct wb_device *wb, struct bio *bio) +static void read_cache_cancel_background(struct read_cache_cells *cells) { - /* In device-mapper bio with REQ_FLUSH is for sure to have no data. */ - BUG_ON(bi_size(bio)); - queue_barrier_io(wb, bio); - return DM_MAPIO_SUBMITTED; + struct rb_node *rbp = rb_first(&cells->rb_root); + struct rb_node *seqhead = rbp; + sector_t last_sector = ~0; + u32 seqcount = 0; + + while (rbp) { + struct read_cache_cell *cell = read_cache_cell_from_node(rbp); + if (cell->sector == (last_sector + 8)) + seqcount++; + else { + if (seqcount > cells->threshold) + visit_and_cancel_cells(seqhead, rbp); + seqcount = 1; + seqhead = rbp; + } + last_sector = cell->sector; + rbp = rb_next(rbp); + } + if (seqcount > cells->threshold) + visit_and_cancel_cells(seqhead, rbp); } -struct lookup_result { - struct ht_head *head; /* Lookup head used */ - struct lookup_key key; /* Lookup key used */ +static void read_cache_proc(struct work_struct *work) +{ + struct wb_device *wb = container_of(work, struct wb_device, read_cache_work); + struct read_cache_cells *cells = wb->read_cache_cells; + u32 i; - struct segment_header *found_seg; - struct metablock *found_mb; + read_cache_cancel_background(cells); - bool found; /* Cache hit? */ - bool on_buffer; /* Is the metablock found on the RAM buffer? */ -}; + for (i = 0; i < cells->size; i++) { + struct read_cache_cell *cell = cells->array + i; + inject_read_cache(wb, cell); + } -/* - * Lookup a bio relevant cache data. - * In case of cache hit, nr_inflight_ios is incremented. 
- */ -static void cache_lookup(struct wb_device *wb, struct bio *bio, struct lookup_result *res) + reinit_read_cache_cells(wb); +} + +static int init_read_cache_cells(struct wb_device *wb) { - res->key = (struct lookup_key) { - .sector = calc_cache_alignment(bi_sector(bio)), - }; - res->head = ht_get_head(wb, &res->key); + struct read_cache_cells *cells; + INIT_WORK(&wb->read_cache_work, read_cache_proc); + cells = alloc_read_cache_cells(wb, wb->nr_read_cache_cells); + if (!cells) + return -ENOMEM; + wb->read_cache_cells = cells; + reinit_read_cache_cells(wb); + return 0; +} - res->found_mb = ht_lookup(wb, res->head, &res->key); - if (res->found_mb) { - res->found_seg = mb_to_seg(wb, res->found_mb); - atomic_inc(&res->found_seg->nr_inflight_ios); +/*----------------------------------------------------------------------------*/ + +static void initialize_write_io(struct write_io *wio, struct bio *bio) +{ + u8 offset = bio_calc_offset(bio); + sector_t count = bio_sectors(bio); + copy_bio_payload(wio->data + (offset << 9), bio); + wio->data_bits = to_mask(offset, count); +} + +static void memcpy_masked(void *to, u8 protect_bits, void *from, u8 copy_bits) +{ + u8 i; + for (i = 0; i < 8; i++) { + bool will_copy = copy_bits & (1 << i); + bool protected = protect_bits & (1 << i); + if (will_copy && (!protected)) { + size_t offset = (i << 9); + memcpy(to + offset, from + offset, 1 << 9); + } } +} - res->found = (res->found_mb != NULL); +int prepare_overwrite(struct wb_device *wb, struct segment_header *seg, struct metablock *old_mb, struct write_io* wio, u8 overwrite_bits) +{ + struct dirtiness dirtiness = read_mb_dirtiness(wb, seg, old_mb); + + bool needs_merge_prev_cache = !(overwrite_bits == 255) || !(dirtiness.data_bits == 255); + + if (!dirtiness.is_dirty) + needs_merge_prev_cache = false; + + if (overwrite_bits == 255) + needs_merge_prev_cache = false; + + if (unlikely(needs_merge_prev_cache)) { + void *buf; + + wait_for_flushing(wb, seg->id); + ASSERT(dirtiness.is_dirty); + + buf = read_mb(wb, seg, old_mb, dirtiness.data_bits); + if (!buf) + return -EIO; + + /* newer data should be prioritized */ + memcpy_masked(wio->data, wio->data_bits, buf, dirtiness.data_bits); + wio->data_bits |= dirtiness.data_bits; + mempool_free(buf, wb->buf_8_pool); + } + + if (mark_clean_mb(wb, old_mb)) + dec_nr_dirty_caches(wb); - res->on_buffer = false; - if (res->found) - res->on_buffer = is_on_buffer(wb, res->found_mb->idx); + ht_del(wb, old_mb); - inc_stat(wb, io_write(bio), res->found, res->on_buffer, io_fullsize(bio)); + return 0; } /* - * Get new place to write. + * Get a new place to write. 
*/ static struct metablock *prepare_new_write_pos(struct wb_device *wb) { struct metablock *ret = wb->current_seg->mb_array + mb_idx_inseg(wb, advance_cursor(wb)); - BUG_ON(ret->dirtiness.is_dirty); + ASSERT(!ret->dirtiness.is_dirty); ret->dirtiness.data_bits = 0; - BUG_ON(ret->dirtiness.data_bits); return ret; } -static void dec_inflight_ios(struct wb_device *wb, struct segment_header *seg) +static void write_on_rambuffer(struct wb_device *wb, struct metablock *write_pos, struct write_io *wio) { - if (atomic_dec_and_test(&seg->nr_inflight_ios)) - wake_up_active_wq(&wb->inflight_ios_wq); + size_t mb_offset = (mb_idx_inseg(wb, write_pos->idx) + 1) << 12; + void *mb_data = wb->current_rambuf->data + mb_offset; + if (wio->data_bits == 255) + memcpy(mb_data, wio->data, 1 << 12); + else + memcpy_masked(mb_data, 0, wio->data, wio->data_bits); } -static void might_cancel_read_cache_cell(struct wb_device *, struct bio *); -static struct metablock *prepare_write_pos(struct wb_device *wb, struct bio *bio) +static int do_process_write(struct wb_device *wb, struct bio *bio) { - struct metablock *ret; + int err = 0; + + struct metablock *write_pos = NULL; struct lookup_result res; + struct write_io wio; + wio.data = mempool_alloc(wb->buf_8_pool, GFP_NOIO); + if (!wio.data) + return -ENOMEM; + initialize_write_io(&wio, bio); + mutex_lock(&wb->io_lock); cache_lookup(wb, bio, &res); + if (res.found) { if (unlikely(res.on_buffer)) { - /* Overwrite on the ram buffer */ - mutex_unlock(&wb->io_lock); - return res.found_mb; + write_pos = res.found_mb; + goto do_write; } else { - /* - * Invalidate the old cache on the cache device because - * we can't overwrite cache block on the cache device. - */ - prepare_overwrite(wb, res.found_seg, res.found_mb, io_fullsize(bio)); + err = prepare_overwrite(wb, res.found_seg, res.found_mb, &wio, wio.data_bits); dec_inflight_ios(wb, res.found_seg); + if (err) + goto out; } } else might_cancel_read_cache_cell(wb, bio); - /* Write on a new position on the ram buffer */ - might_queue_current_buffer(wb); - ret = prepare_new_write_pos(wb); + write_pos = prepare_new_write_pos(wb); - ht_register(wb, res.head, ret, &res.key); +do_write: + ASSERT(write_pos); + write_on_rambuffer(wb, write_pos, &wio); - mutex_unlock(&wb->io_lock); + if (taint_mb(wb, write_pos, wio.data_bits)) + inc_nr_dirty_caches(wb); - return ret; + ht_register(wb, res.head, write_pos, &res.key); + +out: + mutex_unlock(&wb->io_lock); + mempool_free(wio.data, wb->buf_8_pool); + return err; } -/* - * Write bio data to RAM buffer. - */ -static int do_process_write(struct wb_device *wb, struct metablock *write_pos, struct bio *bio) +static int complete_process_write(struct wb_device *wb, struct bio *bio) { - if (taint_mb(wb, write_pos, bio)) - inc_nr_dirty_caches(wb); - - write_on_rambuffer(wb, write_pos, bio); - dec_inflight_ios(wb, wb->current_seg); /* - * bio with REQ_FUA has data. - * For such bio, we first treat it like a normal bio and then as a REQ_FLUSH bio. + * bio with FUA flag has data. + * We first handle it as a normal write bio and then as a barrier bio. 
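 * In other words, by the time we get here the payload has already been
 * copied into the RAM buffer by do_process_write(); queue_barrier_io()
 * then defers completion of the bio, and copy_barrier_requests() attaches
 * it to the RAM buffer, so it is presumably only acknowledged once that
 * buffer has been flushed, which is what FUA requires.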
*/ - if (bio->bi_rw & REQ_FUA) { + if (bio_is_fua(bio)) { queue_barrier_io(wb, bio); return DM_MAPIO_SUBMITTED; } - if (is_live(wb)) - bio_endio_compat(bio, 0); - else - bio_endio_compat(bio, -EIO); - + bio_endio_compat(bio, 0); return DM_MAPIO_SUBMITTED; } @@ -837,22 +1210,24 @@ * 2) Wait for decrement outside the lock * * process_write: - * prepare_write_pos: + * do_process_write: * mutex_lock (to serialize write) * inc in_flight_ios # refcount on the dst segment * mutex_unlock * - * do_process_write: + * complete_process_write: * dec in_flight_ios * bio_endio(bio) */ static int process_write_wb(struct wb_device *wb, struct bio *bio) { - struct metablock *write_pos = prepare_write_pos(wb, bio); - return do_process_write(wb, write_pos, bio); + int err = do_process_write(wb, bio); + if (err) + return err; + return complete_process_write(wb, bio); } -static int process_write_wt(struct wb_device *wb, struct bio *bio) +static int process_write_wa(struct wb_device *wb, struct bio *bio) { struct lookup_result res; @@ -872,499 +1247,241 @@ static int process_write(struct wb_device *wb, struct bio *bio) { - return wb->write_through_mode ? process_write_wt(wb, bio) : process_write_wb(wb, bio); + return wb->write_around_mode ? process_write_wa(wb, bio) : process_write_wb(wb, bio); } -enum PBD_FLAG { - PBD_NONE = 0, - PBD_WILL_CACHE = 1, - PBD_READ_SEG = 2, -}; - -struct per_bio_data { - enum PBD_FLAG type; - union { - u32 cell_idx; - struct segment_header *seg; - }; +struct read_backing_async_context { + struct wb_device *wb; + struct bio *bio; }; -#define per_bio_data(wb, bio) ((struct per_bio_data *)dm_per_bio_data((bio), (wb)->ti->per_bio_data_size)) - -static void reserve_read_cache_cell(struct wb_device *, struct bio *); -static int process_read(struct wb_device *wb, struct bio *bio) -{ - struct lookup_result res; - struct dirtiness dirtiness; - - mutex_lock(&wb->io_lock); - cache_lookup(wb, bio, &res); - if (!res.found) - reserve_read_cache_cell(wb, bio); - mutex_unlock(&wb->io_lock); - - if (!res.found) { - bio_remap(bio, wb->backing_dev, bi_sector(bio)); - return DM_MAPIO_REMAPPED; - } - - dirtiness = read_mb_dirtiness(wb, res.found_seg, res.found_mb); - if (unlikely(res.on_buffer)) { - if (dirtiness.is_dirty) - writeback_buffered_mb(wb, res.found_mb, dirtiness.data_bits); - - dec_inflight_ios(wb, res.found_seg); - bio_remap(bio, wb->backing_dev, bi_sector(bio)); - return DM_MAPIO_REMAPPED; - } - - /* - * We need to wait for the segment to be flushed to the cache device. - * Without this, we might read the wrong data from the cache device. - */ - wait_for_flushing(wb, res.found_seg->id); - - if (likely(dirtiness.data_bits == 255)) { - struct per_bio_data *pbd = per_bio_data(wb, bio); - pbd->type = PBD_READ_SEG; - pbd->seg = res.found_seg; - - bio_remap(bio, wb->cache_dev, - calc_mb_start_sector(wb, res.found_seg, res.found_mb->idx) + - io_offset(bio)); - } else { - if (dirtiness.is_dirty) - writeback_mb(wb, res.found_seg, res.found_mb, dirtiness.data_bits, true); - if (mark_clean_mb(wb, res.found_mb)) - dec_nr_dirty_caches(wb); - dec_inflight_ios(wb, res.found_seg); - bio_remap(bio, wb->backing_dev, bi_sector(bio)); - } - - if (!is_live(wb)) - bio_io_error(bio); - - return DM_MAPIO_REMAPPED; -} - -static int process_bio(struct wb_device *wb, struct bio *bio) -{ - return io_write(bio) ? 
process_write(wb, bio) : process_read(wb, bio); -} - -static int writeboost_map(struct dm_target *ti, struct bio *bio) -{ - struct wb_device *wb = ti->private; - - struct per_bio_data *pbd = per_bio_data(wb, bio); - pbd->type = PBD_NONE; - - if (bio->bi_rw & REQ_DISCARD) - return process_discard_bio(wb, bio); - - if (bio->bi_rw & REQ_FLUSH) - return process_flush_bio(wb, bio); - - return process_bio(wb, bio); -} - -static void read_cache_cell_copy_data(struct wb_device *, struct bio*, int error); -static int writeboost_end_io(struct dm_target *ti, struct bio *bio, int error) -{ - struct wb_device *wb = ti->private; - struct per_bio_data *pbd = per_bio_data(wb, bio); - - switch (pbd->type) { - case PBD_NONE: - return 0; - case PBD_WILL_CACHE: - read_cache_cell_copy_data(wb, bio, error); - return 0; - case PBD_READ_SEG: - dec_inflight_ios(wb, pbd->seg); - return 0; - default: - BUG(); - } -} - -/*----------------------------------------------------------------------------*/ - -#define read_cache_cell_from_node(node) rb_entry((node), struct read_cache_cell, rb_node) - -static void read_cache_add(struct read_cache_cells *cells, struct read_cache_cell *cell) -{ - struct rb_node **rbp, *parent; - rbp = &cells->rb_root.rb_node; - parent = NULL; - while (*rbp) { - struct read_cache_cell *parent_cell; - parent = *rbp; - parent_cell = read_cache_cell_from_node(parent); - if (cell->sector < parent_cell->sector) - rbp = &(*rbp)->rb_left; - else - rbp = &(*rbp)->rb_right; - } - rb_link_node(&cell->rb_node, parent, rbp); - rb_insert_color(&cell->rb_node, &cells->rb_root); -} - -static struct read_cache_cell *lookup_read_cache_cell(struct wb_device *wb, sector_t sector) -{ - struct rb_node **rbp, *parent; - rbp = &wb->read_cache_cells->rb_root.rb_node; - parent = NULL; - while (*rbp) { - struct read_cache_cell *parent_cell; - parent = *rbp; - parent_cell = read_cache_cell_from_node(parent); - if (parent_cell->sector == sector) - return parent_cell; - - if (sector < parent_cell->sector) - rbp = &(*rbp)->rb_left; - else - rbp = &(*rbp)->rb_right; - } - return NULL; -} - -static void read_cache_cancel_cells(struct read_cache_cells *cells, u32 n) -{ - u32 i; - u32 last = cells->cursor + cells->seqcount; - if (last > cells->size) - last = cells->size; - for (i = cells->cursor; i < last; i++) { - struct read_cache_cell *cell = cells->array + i; - cell->cancelled = true; - } -} - -/* - * Track the forefront read address and cancel cells in case of over threshold. - * If the cell is cancelled foreground, we can save the memory copy in the background. - */ -static void read_cache_cancel_foreground(struct read_cache_cells *cells, - struct read_cache_cell *new_cell) -{ - if (new_cell->sector == (cells->last_sector + 8)) - cells->seqcount++; - else { - cells->seqcount = 1; - cells->over_threshold = false; - } - - if (cells->seqcount > cells->threshold) { - if (cells->over_threshold) - new_cell->cancelled = true; - else { - cells->over_threshold = true; - read_cache_cancel_cells(cells, cells->seqcount); - } - } - cells->last_sector = new_cell->sector; -} - -static void reserve_read_cache_cell(struct wb_device *wb, struct bio *bio) -{ - struct per_bio_data *pbd; - struct read_cache_cells *cells = wb->read_cache_cells; - struct read_cache_cell *found, *new_cell; - - BUG_ON(!cells->threshold); - - if (!ACCESS_ONCE(wb->read_cache_threshold)) - return; - - if (!cells->cursor) - return; - - /* - * We only cache 4KB read data for following reasons: - * 1) Caching partial data (< 4KB) is likely meaningless. 
- * 2) Caching partial data makes the read-caching mechanism very hard. - */ - if (!io_fullsize(bio)) - return; - - /* - * We don't need to reserve the same address twice - * because it's either unchanged or invalidated. - */ - found = lookup_read_cache_cell(wb, bi_sector(bio)); - if (found) - return; - - cells->cursor--; - new_cell = cells->array + cells->cursor; - new_cell->sector = bi_sector(bio); - read_cache_add(cells, new_cell); - - pbd = per_bio_data(wb, bio); - pbd->type = PBD_WILL_CACHE; - pbd->cell_idx = cells->cursor; - - /* Cancel the new_cell if needed */ - read_cache_cancel_foreground(cells, new_cell); -} -static void might_cancel_read_cache_cell(struct wb_device *wb, struct bio *bio) +static void read_backing_async_callback_onstack(unsigned long error, struct read_backing_async_context *ctx) { - struct read_cache_cell *found; - found = lookup_read_cache_cell(wb, calc_cache_alignment(bi_sector(bio))); - if (found) - found->cancelled = true; -} + ASSERT(bio_is_fullsize(ctx->bio)); -static void read_cache_cell_copy_data(struct wb_device *wb, struct bio *bio, int error) -{ - struct per_bio_data *pbd = per_bio_data(wb, bio); - struct read_cache_cells *cells = wb->read_cache_cells; - struct read_cache_cell *cell = cells->array + pbd->cell_idx; + read_cache_cell_copy_data(ctx->wb, ctx->bio, error); - /* Data can be broken. So don't stage. */ if (error) - cell->cancelled = true; - - /* - * We can omit copying if the cell is cancelled but - * copying for a non-cancelled cell isn't problematic. - */ - if (!cell->cancelled) - copy_bio_payload(cell->data, bio); - - if (atomic_dec_and_test(&cells->ack_count)) - queue_work(cells->wq, &wb->read_cache_work); + bio_io_error(ctx->bio); + else + bio_endio_compat(ctx->bio, 0); } -/* - * Get a read cache cell through simplified write path if the cell data isn't stale. - */ -static void inject_read_cache(struct wb_device *wb, struct read_cache_cell *cell) +static void read_backing_async_callback(unsigned long error, void *context) { - struct metablock *mb; - u32 _mb_idx_inseg; - struct ht_head *head; - struct segment_header *seg; + struct read_backing_async_context *ctx = context; + read_backing_async_callback_onstack(error, ctx); + kfree(ctx); +} - struct lookup_key key = { - .sector = cell->sector, - }; +static int read_backing_async(struct wb_device *wb, struct bio *bio) +{ + int err = 0; - mutex_lock(&wb->io_lock); - /* - * if might_cancel_read_cache_cell() on the foreground - * cancelled this cell, the data is now stale. - */ - if (cell->cancelled) { - mutex_unlock(&wb->io_lock); - return; - } + struct dm_io_request io_req; + struct dm_io_region region; - /* - * FIXME Why do we need to double-check here? 
- */ - head = ht_get_head(wb, &key); - mb = ht_lookup(wb, head, &key); - if (unlikely(mb)) { - mutex_unlock(&wb->io_lock); - return; - } + struct read_backing_async_context *ctx = kmalloc(sizeof(struct read_backing_async_context), GFP_NOIO); + if (!ctx) + return -ENOMEM; - might_queue_current_buffer(wb); + ctx->wb = wb; + ctx->bio = bio; - seg = wb->current_seg; - _mb_idx_inseg = mb_idx_inseg(wb, advance_cursor(wb)); - mb = seg->mb_array + _mb_idx_inseg; - BUG_ON(mb->dirtiness.is_dirty); - mb->dirtiness.data_bits = 255; + ASSERT(bio_is_fullsize(bio)); - ht_register(wb, head, mb, &key); + io_req = (struct dm_io_request) { + WB_IO_READ, + .client = wb->io_client, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) + .mem.type = DM_IO_BIO, + .mem.ptr.bio = bio, +#else + .mem.type = DM_IO_BVEC, + .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, +#endif + .notify.fn = read_backing_async_callback, + .notify.context = ctx + }; + region = (struct dm_io_region) { + .bdev = wb->backing_dev->bdev, + .sector = bi_sector(bio), + .count = 8 + }; - mutex_unlock(&wb->io_lock); + err = wb_io(&io_req, 1, ®ion, NULL, false); + if (err) + kfree(ctx); - memcpy(wb->current_rambuf->data + ((_mb_idx_inseg + 1) << 12), cell->data, 1 << 12); - dec_inflight_ios(wb, seg); + return err; } -static void free_read_cache_cell_data(struct read_cache_cells *cells) +static int process_read(struct wb_device *wb, struct bio *bio) { - u32 i; - for (i = 0; i < cells->size; i++) { - struct read_cache_cell *cell = cells->array + i; - kfree(cell->data); - } -} + struct lookup_result res; + struct dirtiness dirtiness; + struct per_bio_data *pbd; -static struct read_cache_cells *alloc_read_cache_cells(struct wb_device *wb, u32 n) -{ - struct read_cache_cells *cells; - u32 i; - cells = kmalloc(sizeof(struct read_cache_cells), GFP_KERNEL); - if (!cells) - return NULL; + bool reserved = false; - cells->size = n; - cells->threshold = UINT_MAX; /* Default: every read will be cached */ - cells->last_sector = ~0; - cells->seqcount = 0; - cells->over_threshold = false; - cells->array = kmalloc(sizeof(struct read_cache_cell) * n, GFP_KERNEL); - if (!cells->array) - goto bad_cells_array; + mutex_lock(&wb->io_lock); + cache_lookup(wb, bio, &res); + if (!res.found) + reserved = reserve_read_cache_cell(wb, bio); + mutex_unlock(&wb->io_lock); - for (i = 0; i < cells->size; i++) { - struct read_cache_cell *cell = cells->array + i; - cell->data = kmalloc(1 << 12, GFP_KERNEL); - if (!cell->data) { - u32 j; - for (j = 0; j < i; j++) { - cell = cells->array + j; - kfree(cell->data); + if (!res.found) { + if (reserved) { + /* + * Remapping clone bio to the backing store leads to + * empty payload in clone_endio(). + * To avoid caching junk data, we need this workaround + * to call dm_io() to certainly fill the bio payload. 
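 * Concretely, read_backing_async() issues an asynchronous dm_io() read
 * from the backing device straight into the bio payload (DM_IO_BIO, or
 * DM_IO_BVEC on pre-3.14 kernels); its completion callback copies the
 * payload into the reserved cell via read_cache_cell_copy_data() and then
 * ends the bio itself. If submission fails, the on-stack variant of the
 * callback is invoked with an error so the cell is cancelled and the bio
 * is failed rather than left hanging.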
+ */ + if (read_backing_async(wb, bio)) { + struct read_backing_async_context ctx = { + .wb = wb, + .bio = bio + }; + read_backing_async_callback_onstack(1, &ctx); } - goto bad_cell_data; + return DM_MAPIO_SUBMITTED; + } else { + bio_remap(bio, wb->backing_dev, bi_sector(bio)); + return DM_MAPIO_REMAPPED; } } - cells->wq = create_singlethread_workqueue("dmwb_read_cache"); - if (!cells->wq) - goto bad_wq; + dirtiness = read_mb_dirtiness(wb, res.found_seg, res.found_mb); + if (unlikely(res.on_buffer)) { + int err = fill_payload_by_backing(wb, bio); + if (err) + goto read_buffered_mb_exit; - return cells; + if (dirtiness.is_dirty) + copy_to_bio_payload(bio, ref_buffered_mb(wb, res.found_mb), dirtiness.data_bits); -bad_wq: - free_read_cache_cell_data(cells); -bad_cell_data: - kfree(cells->array); -bad_cells_array: - kfree(cells); - return NULL; -} +read_buffered_mb_exit: + dec_inflight_ios(wb, res.found_seg); -static void free_read_cache_cells(struct wb_device *wb) -{ - struct read_cache_cells *cells = wb->read_cache_cells; - destroy_workqueue(cells->wq); /* This drains wq. So, must precede the others */ - free_read_cache_cell_data(cells); - kfree(cells->array); - kfree(cells); -} + if (unlikely(err)) + bio_io_error(bio); + else + bio_endio_compat(bio, 0); -static void reinit_read_cache_cells(struct wb_device *wb) -{ - struct read_cache_cells *cells = wb->read_cache_cells; - u32 i, cur_threshold; - for (i = 0; i < cells->size; i++) { - struct read_cache_cell *cell = cells->array + i; - cell->cancelled = false; + return DM_MAPIO_SUBMITTED; } - atomic_set(&cells->ack_count, cells->size); - mutex_lock(&wb->io_lock); - cells->rb_root = RB_ROOT; - cells->cursor = cells->size; - cur_threshold = ACCESS_ONCE(wb->read_cache_threshold); - if (cur_threshold && (cur_threshold != cells->threshold)) { - cells->threshold = cur_threshold; - cells->over_threshold = false; + /* + * We need to wait for the segment to be flushed to the cache device. + * Without this, we might read the wrong data from the cache device. + */ + wait_for_flushing(wb, res.found_seg->id); + + if (unlikely(dirtiness.data_bits != 255)) { + int err = fill_payload_by_backing(wb, bio); + if (err) + goto read_mb_exit; + + if (dirtiness.is_dirty) { + void *buf = read_mb(wb, res.found_seg, res.found_mb, dirtiness.data_bits); + if (!buf) { + err = -EIO; + goto read_mb_exit; + } + copy_to_bio_payload(bio, buf, dirtiness.data_bits); + mempool_free(buf, wb->buf_8_pool); + } + +read_mb_exit: + dec_inflight_ios(wb, res.found_seg); + + if (unlikely(err)) + bio_io_error(bio); + else + bio_endio_compat(bio, 0); + + return DM_MAPIO_SUBMITTED; } - mutex_unlock(&wb->io_lock); + + pbd = per_bio_data(wb, bio); + pbd->type = PBD_READ_SEG; + pbd->seg = res.found_seg; + + bio_remap(bio, wb->cache_dev, + calc_mb_start_sector(wb, res.found_seg, res.found_mb->idx) + + bio_calc_offset(bio)); + + return DM_MAPIO_REMAPPED; } -/* - * Cancel cells [first, last) - */ -static void visit_and_cancel_cells(struct rb_node *first, struct rb_node *last) +static int process_bio(struct wb_device *wb, struct bio *bio) { - struct rb_node *rbp = first; - while (rbp != last) { - struct read_cache_cell *cell = read_cache_cell_from_node(rbp); - cell->cancelled = true; - rbp = rb_next(rbp); - } + return bio_is_write(bio) ? process_write(wb, bio) : process_read(wb, bio); } -/* - * Find out sequence from cells and cancel them if larger than threshold. 
- */ -static void read_cache_cancel_background(struct read_cache_cells *cells) +static int process_barrier_bio(struct wb_device *wb, struct bio *bio) { - struct rb_node *rbp = rb_first(&cells->rb_root); - struct rb_node *seqhead = rbp; - sector_t last_sector = ~0; - u32 seqcount = 0; - - while (rbp) { - struct read_cache_cell *cell = read_cache_cell_from_node(rbp); - if (cell->sector == (last_sector + 8)) - seqcount++; - else { - if (seqcount > cells->threshold) - visit_and_cancel_cells(seqhead, rbp); - seqcount = 1; - seqhead = rbp; - } - last_sector = cell->sector; - rbp = rb_next(rbp); - } - if (seqcount > cells->threshold) - visit_and_cancel_cells(seqhead, rbp); + /* barrier bio doesn't have data */ + ASSERT(bio_sectors(bio) == 0); + queue_barrier_io(wb, bio); + return DM_MAPIO_SUBMITTED; } -static void read_cache_proc(struct work_struct *work) +static int writeboost_map(struct dm_target *ti, struct bio *bio) { - struct wb_device *wb = container_of(work, struct wb_device, read_cache_work); - struct read_cache_cells *cells = wb->read_cache_cells; - u32 i; + struct wb_device *wb = ti->private; - read_cache_cancel_background(cells); + struct per_bio_data *pbd = per_bio_data(wb, bio); + pbd->type = PBD_NONE; - for (i = 0; i < cells->size; i++) { - struct read_cache_cell *cell = cells->array + i; - inject_read_cache(wb, cell); - } - reinit_read_cache_cells(wb); + if (bio_is_barrier(bio)) + return process_barrier_bio(wb, bio); + + return process_bio(wb, bio); } -static int init_read_cache_cells(struct wb_device *wb) +static int writeboost_end_io(struct dm_target *ti, struct bio *bio, int error) { - struct read_cache_cells *cells; - INIT_WORK(&wb->read_cache_work, read_cache_proc); - cells = alloc_read_cache_cells(wb, 2048); /* 8MB */ - if (!cells) - return -ENOMEM; - wb->read_cache_cells = cells; - reinit_read_cache_cells(wb); - return 0; -} + struct wb_device *wb = ti->private; + struct per_bio_data *pbd = per_bio_data(wb, bio); -/*----------------------------------------------------------------------------*/ + switch (pbd->type) { + case PBD_NONE: + case PBD_WILL_CACHE: + return 0; + case PBD_READ_SEG: + dec_inflight_ios(wb, pbd->seg); + return 0; + default: + BUG(); + } +} static int consume_essential_argv(struct wb_device *wb, struct dm_arg_set *as) { - int r = 0; + int err = 0; struct dm_target *ti = wb->ti; - r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), - &wb->backing_dev); - if (r) { + err = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), + &wb->backing_dev); + if (err) { DMERR("Failed to get backing_dev"); - return r; + return err; } - r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), - &wb->cache_dev); - if (r) { + err = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), + &wb->cache_dev); + if (err) { DMERR("Failed to get cache_dev"); goto bad_get_cache; } - return r; + return err; bad_get_cache: dm_put_device(ti, wb->backing_dev); - return r; + return err; } #define consume_kv(name, nr, is_static) { \ @@ -1375,8 +1492,8 @@ DMERR("%s is a static option", #name); \ break; \ } \ - r = dm_read_arg(_args + (nr), as, &tmp, &ti->error); \ - if (r) { \ + err = dm_read_arg(_args + (nr), as, &tmp, &ti->error); \ + if (err) { \ DMERR("%s", ti->error); \ break; \ } \ @@ -1385,7 +1502,7 @@ static int do_consume_optional_argv(struct wb_device *wb, struct dm_arg_set *as, unsigned argc) { - int r = 0; + int err = 0; struct dm_target *ti = wb->ti; static struct dm_arg _args[] = { @@ -1394,7 +1511,8 @@ {0, 3600, "Invalid 
update_sb_record_interval"}, {0, 3600, "Invalid sync_data_interval"}, {0, 127, "Invalid read_cache_threshold"}, - {0, 1, "Invalid write_through_mode"}, + {0, 1, "Invalid write_around_mode"}, + {1, 2048, "Invalid nr_read_cache_cells"}, }; unsigned tmp; @@ -1402,16 +1520,17 @@ const char *key = dm_shift_arg(as); argc--; - r = -EINVAL; + err = -EINVAL; consume_kv(writeback_threshold, 0, false); consume_kv(nr_max_batched_writeback, 1, false); consume_kv(update_sb_record_interval, 2, false); consume_kv(sync_data_interval, 3, false); consume_kv(read_cache_threshold, 4, false); - consume_kv(write_through_mode, 5, true); + consume_kv(write_around_mode, 5, true); + consume_kv(nr_read_cache_cells, 6, true); - if (!r) { + if (!err) { argc--; } else { ti->error = "Invalid optional key"; @@ -1419,24 +1538,24 @@ } } - return r; + return err; } static int consume_optional_argv(struct wb_device *wb, struct dm_arg_set *as) { - int r = 0; + int err = 0; struct dm_target *ti = wb->ti; static struct dm_arg _args[] = { - {0, 12, "Invalid optional argc"}, + {0, 14, "Invalid optional argc"}, }; unsigned argc = 0; if (as->argc) { - r = dm_read_arg_group(_args, as, &argc, &ti->error); - if (r) { + err = dm_read_arg_group(_args, as, &argc, &ti->error); + if (err) { DMERR("%s", ti->error); - return r; + return err; } } @@ -1448,20 +1567,30 @@ static int init_core_struct(struct dm_target *ti) { - int r = 0; + int err = 0; struct wb_device *wb; - r = dm_set_target_max_io_len(ti, 1 << 3); - if (r) { + err = dm_set_target_max_io_len(ti, 1 << 3); + if (err) { DMERR("Failed to set max_io_len"); - return r; + return err; } - ti->flush_supported = true; ti->num_flush_bios = 1; - ti->num_discard_bios = 1; - ti->discard_zeroes_data_unsupported = true; - ti->per_bio_data_size = sizeof(struct per_bio_data); + ti->flush_supported = true; + + /* + * dm-writeboost does't support TRIM + * + * https://github.com/akiradeveloper/dm-writeboost/issues/110 + * - discarding backing data only violates DRAT + * - strictly discarding both cache blocks and backing data is nearly impossible + * considering cache hits may occur partially. 
+ */ + ti->num_discard_bios = 0; + ti->discards_supported = false; + + ti->PER_BIO_DATA_SIZE = sizeof(struct per_bio_data); wb = kzalloc(sizeof(*wb), GFP_KERNEL); if (!wb) { @@ -1471,48 +1600,47 @@ ti->private = wb; wb->ti = ti; - init_waitqueue_head(&wb->writeback_mb_wait_queue); wb->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); if (IS_ERR(wb->copier)) { - r = PTR_ERR(wb->copier); + err = PTR_ERR(wb->copier); goto bad_kcopyd_client; } wb->buf_1_cachep = kmem_cache_create("dmwb_buf_1", - 1 << 9, 1 << SECTOR_SHIFT, SLAB_RED_ZONE, NULL); + 1 << 9, 1 << 9, SLAB_RED_ZONE, NULL); if (!wb->buf_1_cachep) { - r = -ENOMEM; + err = -ENOMEM; goto bad_buf_1_cachep; } wb->buf_1_pool = mempool_create_slab_pool(16, wb->buf_1_cachep); if (!wb->buf_1_pool) { - r = -ENOMEM; + err = -ENOMEM; goto bad_buf_1_pool; } wb->buf_8_cachep = kmem_cache_create("dmwb_buf_8", 1 << 12, 1 << 12, SLAB_RED_ZONE, NULL); if (!wb->buf_8_cachep) { - r = -ENOMEM; + err = -ENOMEM; goto bad_buf_8_cachep; } wb->buf_8_pool = mempool_create_slab_pool(16, wb->buf_8_cachep); if (!wb->buf_8_pool) { - r = -ENOMEM; + err = -ENOMEM; goto bad_buf_8_pool; } wb->io_wq = create_singlethread_workqueue("dmwb_io"); if (!wb->io_wq) { DMERR("Failed to allocate io_wq"); - r = -ENOMEM; + err = -ENOMEM; goto bad_io_wq; } wb->io_client = dm_io_client_create(); if (IS_ERR(wb->io_client)) { DMERR("Failed to allocate io_client"); - r = PTR_ERR(wb->io_client); + err = PTR_ERR(wb->io_client); goto bad_io_client; } @@ -1520,10 +1648,9 @@ init_waitqueue_head(&wb->inflight_ios_wq); spin_lock_init(&wb->mb_lock); atomic64_set(&wb->nr_dirty_caches, 0); - clear_bit(WB_DEAD, &wb->flags); clear_bit(WB_CREATED, &wb->flags); - return r; + return err; bad_io_client: destroy_workqueue(wb->io_wq); @@ -1539,7 +1666,7 @@ dm_kcopyd_client_destroy(wb->copier); bad_kcopyd_client: kfree(wb); - return r; + return err; } static void free_core_struct(struct wb_device *wb) @@ -1586,6 +1713,9 @@ kfree(wb->ctr_args); } +#define save_arg(name) wb->name##_saved = wb->name +#define restore_arg(name) if (wb->name##_saved) { wb->name = wb->name##_saved; } + /* * Create a writeboost device * @@ -1597,47 +1727,55 @@ */ static int writeboost_ctr(struct dm_target *ti, unsigned int argc, char **argv) { - int r = 0; + int err = 0; struct wb_device *wb; struct dm_arg_set as; as.argc = argc; as.argv = argv; - r = init_core_struct(ti); - if (r) { + err = init_core_struct(ti); + if (err) { ti->error = "init_core_struct failed"; - return r; + return err; } wb = ti->private; - r = copy_ctr_args(wb, argc - 2, (const char **)argv + 2); - if (r) { + err = copy_ctr_args(wb, argc - 2, (const char **)argv + 2); + if (err) { ti->error = "copy_ctr_args failed"; goto bad_ctr_args; } - r = consume_essential_argv(wb, &as); - if (r) { + err = consume_essential_argv(wb, &as); + if (err) { ti->error = "consume_essential_argv failed"; goto bad_essential_argv; } - r = resume_cache(wb); - if (r) { - ti->error = "resume_cache failed"; - goto bad_resume_cache; - } - - wb->read_cache_threshold = 0; /* Default: read-caching disabled */ - r = consume_optional_argv(wb, &as); - if (r) { + err = consume_optional_argv(wb, &as); + if (err) { ti->error = "consume_optional_argv failed"; goto bad_optional_argv; } - r = init_read_cache_cells(wb); - if (r) { + save_arg(writeback_threshold); + save_arg(nr_max_batched_writeback); + save_arg(update_sb_record_interval); + save_arg(sync_data_interval); + save_arg(read_cache_threshold); + save_arg(nr_read_cache_cells); + + err = resume_cache(wb); + if (err) { + ti->error = 
"resume_cache failed"; + goto bad_resume_cache; + } + + wb->nr_read_cache_cells = 2048; /* 8MB */ + restore_arg(nr_read_cache_cells); + err = init_read_cache_cells(wb); + if (err) { ti->error = "init_read_cache_cells failed"; goto bad_read_cache_cells; } @@ -1645,21 +1783,28 @@ clear_stat(wb); set_bit(WB_CREATED, &wb->flags); - return r; + + restore_arg(writeback_threshold); + restore_arg(nr_max_batched_writeback); + restore_arg(update_sb_record_interval); + restore_arg(sync_data_interval); + restore_arg(read_cache_threshold); + + return err; bad_read_cache_cells: -bad_optional_argv: free_cache(wb); bad_resume_cache: dm_put_device(ti, wb->cache_dev); dm_put_device(ti, wb->backing_dev); +bad_optional_argv: bad_essential_argv: free_ctr_args(wb); bad_ctr_args: free_core_struct(wb); ti->private = NULL; - return r; + return err; } static void writeboost_dtr(struct dm_target *ti) @@ -1687,10 +1832,9 @@ */ static void writeboost_postsuspend(struct dm_target *ti) { - int r = 0; struct wb_device *wb = ti->private; flush_current_buffer(wb); - maybe_IO(blkdev_issue_flush(wb->cache_dev->bdev, GFP_NOIO, NULL)); + blkdev_issue_flush(wb->cache_dev->bdev, GFP_NOIO, NULL); } static int writeboost_message(struct dm_target *ti, unsigned argc, char **argv) @@ -1707,12 +1851,12 @@ } if (!strcasecmp(argv[0], "drop_caches")) { - int r = 0; + int err = 0; wb->force_drop = true; - r = wait_event_interruptible(wb->wait_drop_caches, - !atomic64_read(&wb->nr_dirty_caches)); + err = wait_event_interruptible(wb->wait_drop_caches, + !atomic64_read(&wb->nr_dirty_caches)); wb->force_drop = false; - return r; + return err; } return do_consume_optional_argv(wb, &as, 2); @@ -1796,7 +1940,7 @@ static struct target_type writeboost_target = { .name = "writeboost", - .version = {2, 1, 1}, + .version = {2, 2, 6}, .module = THIS_MODULE, .map = writeboost_map, .end_io = writeboost_end_io, @@ -1811,15 +1955,15 @@ static int __init writeboost_module_init(void) { - int r = 0; + int err = 0; - r = dm_register_target(&writeboost_target); - if (r < 0) { + err = dm_register_target(&writeboost_target); + if (err < 0) { DMERR("Failed to register target"); - return r; + return err; } - return r; + return err; } static void __exit writeboost_module_exit(void)