Date: 2010-05-26 06:06:31
Author: Nitin Gupta
Commit: 12e9bd2709bef6b8c49ec3e95edbc9e21807042e
Message: Support generic I/O requests

Currently, ramzswap devices (/dev/ramzswapX) can only
be used as swap disks, since the driver was hard-coded to
consider only the first segment in a bio vector.

Now, we iterate over all the segments in an incoming
bio, which allows us to handle all kinds of I/O requests.
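
For reference, both the read and write paths now follow roughly the loop
sketched below. This is a minimal sketch, not the actual driver code;
handle_segment() is a hypothetical placeholder for the per-page work that
ramzswap_read()/ramzswap_write() do in the diff further down.

static int sketch_rw(struct ramzswap *rzs, struct bio *bio)
{
    int i;
    u32 index;
    struct bio_vec *bvec;

    /* First page-sized slot touched by this request */
    index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;

    /* Visit every segment instead of just bio->bi_io_vec[0] */
    bio_for_each_segment(bvec, bio, i) {
        struct page *page = bvec->bv_page;

        handle_segment(rzs, page, index); /* hypothetical helper */
        index++; /* each segment is exactly one page */
    }

    set_bit(BIO_UPTODATE, &bio->bi_flags);
    bio_endio(bio, 0);
    return 0;
}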

ramzswap devices can still handle only I/O requests that are
PAGE_SIZE aligned and a multiple of PAGE_SIZE in size. To ensure
that we always receive only such requests, we set the following
request_queue attributes to PAGE_SIZE (see the sketch below):
- physical_block_size
- logical_block_size
- io_min
- io_opt

Note: physical and logical block sizes were already set
equal to PAGE_SIZE and that seems to be sufficient to get
PAGE_SIZE aligned I/O.
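
A minimal sketch of that queue setup, assuming 'queue' points to the
device's already-allocated request_queue (in the patch these calls are
made on rzs->disk->queue during device creation):

#include <linux/blkdev.h>

static void sketch_setup_queue(struct request_queue *queue)
{
    /*
     * Advertise PAGE_SIZE for all four attributes so the block
     * layer only sends PAGE_SIZE-aligned, n*PAGE_SIZE-sized I/O.
     */
    blk_queue_physical_block_size(queue, PAGE_SIZE);
    blk_queue_logical_block_size(queue, PAGE_SIZE);
    blk_queue_io_min(queue, PAGE_SIZE);
    blk_queue_io_opt(queue, PAGE_SIZE);
}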

Since we are no longer limited to handling swap requests
only, the next few patches rename ramzswap to zram. So,
the devices will then be called /dev/zram{0, 1, 2, ...}

Usage/Examples:
1) Use as /tmp storage
- mkfs.ext4 /dev/zram0
- mount /dev/zram0 /tmp

2) Use as swap:
- mkswap /dev/zram0
- swapon /dev/zram0 -p 10 # give zram0 a higher swap priority

Performance:

- I/O benchmark done with the 'dd' command. Details can be
found here:
http://code.google.com/p/compcache/wiki/zramperf
Summary:
- Maximum read speed (approx):
    - ram disk: 1200 MB/sec
    - zram disk: 600 MB/sec
- Maximum write speed (approx):
    - ram disk: 500 MB/sec
    - zram disk: 160 MB/sec

Issues:

- Double caching: We can potentially waste memory by keeping
two copies of a page -- one in the page cache (uncompressed) and
a second in the device memory (compressed). However, during
reclaim, clean page cache pages are quickly freed, so this
does not seem to be a big problem.

- Stale data: Not all filesystems support issuing 'discard'
requests to the underlying block device. So, if such filesystems
are used over zram devices, we can accumulate a lot of stale
data in memory. Even for filesystems that do support discard
(for example, ext4), we need to see how effective it is.

- Scalability: There is only one (per-device) de/compression
buffer. This can lead to significant contention, especially
when the device is used for generic (non-swap) purposes.

Signed-off-by: Nitin Gupta <ngupta@vflare.org>
Files: drivers/staging/ramzswap/ramzswap_drv.c (10 diffs)
drivers/staging/ramzswap/ramzswap_drv.h (1 diff)

Change Details

drivers/staging/ramzswap/ramzswap_drv.c
101101    rzs->disksize &= PAGE_MASK;
102102}
103103
104/*
105 * Swap header (1st page of swap device) contains information
106 * about a swap file/partition. Prepare such a header for the
107 * given ramzswap device so that swapon can identify it as a
108 * swap partition.
109 */
110static void setup_swap_header(struct ramzswap *rzs, union swap_header *s)
111{
112    s->info.version = 1;
113    s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
114    s->info.nr_badpages = 0;
115    memcpy(s->magic.magic, "SWAPSPACE2", 10);
116}
117
118104static void ramzswap_ioctl_get_stats(struct ramzswap *rzs,
119105            struct ramzswap_ioctl_stats *s)
120106{
...... 
202188    rzs->table[index].offset = 0;
203189}
204190
205static int handle_zero_page(struct bio *bio)
191static void handle_zero_page(struct page *page)
206192{
207193    void *user_mem;
208    struct page *page = bio->bi_io_vec[0].bv_page;
209194
210195    user_mem = kmap_atomic(page, KM_USER0);
211196    memset(user_mem, 0, PAGE_SIZE);
212197    kunmap_atomic(user_mem, KM_USER0);
213198
214199    flush_dcache_page(page);
215
216    set_bit(BIO_UPTODATE, &bio->bi_flags);
217    bio_endio(bio, 0);
218    return 0;
219200}
220201
221static int handle_uncompressed_page(struct ramzswap *rzs, struct bio *bio)
202static void handle_uncompressed_page(struct ramzswap *rzs,
203                struct page *page, u32 index)
222204{
223    u32 index;
224    struct page *page;
225205    unsigned char *user_mem, *cmem;
226206
227    page = bio->bi_io_vec[0].bv_page;
228    index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
229
230207    user_mem = kmap_atomic(page, KM_USER0);
231208    cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
232209            rzs->table[index].offset;
...... 
236213    kunmap_atomic(cmem, KM_USER1);
237214
238215    flush_dcache_page(page);
239
240    set_bit(BIO_UPTODATE, &bio->bi_flags);
241    bio_endio(bio, 0);
242    return 0;
243}
244
245/*
246 * Called when request page is not present in ramzswap.
247 * This is an attempt to read before any previous write
248 * to this location - this happens due to readahead when
249 * swap device is read from user-space (e.g. during swapon)
250 */
251static int handle_ramzswap_fault(struct ramzswap *rzs, struct bio *bio)
252{
253    pr_debug("Read before write on swap device: "
254        "sector=%lu, size=%u, offset=%u\n",
255        (ulong)(bio->bi_sector), bio->bi_size,
256        bio->bi_io_vec[0].bv_offset);
257
258    /* Do nothing. Just return success */
259    set_bit(BIO_UPTODATE, &bio->bi_flags);
260    bio_endio(bio, 0);
261    return 0;
262216}
263217
264218static int ramzswap_read(struct ramzswap *rzs, struct bio *bio)
265219{
266    int ret;
220
221    int i;
267222    u32 index;
268    size_t clen;
269    struct page *page;
270    struct zobj_header *zheader;
271    unsigned char *user_mem, *cmem;
223    struct bio_vec *bvec;
272224
273225    rzs_stat64_inc(rzs, &rzs->stats.num_reads);
274226
275    page = bio->bi_io_vec[0].bv_page;
276227    index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
228    bio_for_each_segment(bvec, bio, i) {
229        int ret;
230        size_t clen;
231        struct page *page;
232        struct zobj_header *zheader;
233        unsigned char *user_mem, *cmem;
277234
278    if (rzs_test_flag(rzs, index, RZS_ZERO))
279        return handle_zero_page(bio);
235        page = bvec->bv_page;
280236
281    /* Requested page is not present in compressed area */
282    if (!rzs->table[index].page)
283        return handle_ramzswap_fault(rzs, bio);
237        if (rzs_test_flag(rzs, index, RZS_ZERO)) {
238            handle_zero_page(page);
239            continue;
240        }
284241
285    /* Page is stored uncompressed since it's incompressible */
286    if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
287        return handle_uncompressed_page(rzs, bio);
242        /* Requested page is not present in compressed area */
243        if (unlikely(!rzs->table[index].page)) {
244            pr_debug("Read before write: sector=%lu, size=%u",
245                (ulong)(bio->bi_sector), bio->bi_size);
246            /* Do nothing */
247            continue;
248        }
288249
289    user_mem = kmap_atomic(page, KM_USER0);
290    clen = PAGE_SIZE;
250        /* Page is stored uncompressed since it's incompressible */
251        if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED))) {
252            handle_uncompressed_page(rzs, page, index);
253            continue;
254        }
291255
292    cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
293            rzs->table[index].offset;
256        user_mem = kmap_atomic(page, KM_USER0);
257        clen = PAGE_SIZE;
294258
295    ret = lzo1x_decompress_safe(
296        cmem + sizeof(*zheader),
297        xv_get_object_size(cmem) - sizeof(*zheader),
298        user_mem, &clen);
259        cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
260                rzs->table[index].offset;
299261
300    kunmap_atomic(user_mem, KM_USER0);
301    kunmap_atomic(cmem, KM_USER1);
262        ret = lzo1x_decompress_safe(
263            cmem + sizeof(*zheader),
264            xv_get_object_size(cmem) - sizeof(*zheader),
265            user_mem, &clen);
302266
303    /* should NEVER happen */
304    if (unlikely(ret != LZO_E_OK)) {
305        pr_err("Decompression failed! err=%d, page=%u\n",
306            ret, index);
307        rzs_stat64_inc(rzs, &rzs->stats.failed_reads);
308        goto out;
309    }
267        kunmap_atomic(user_mem, KM_USER0);
268        kunmap_atomic(cmem, KM_USER1);
310269
311    flush_dcache_page(page);
270        /* Should NEVER happen. Return bio error if it does. */
271        if (unlikely(ret != LZO_E_OK)) {
272            pr_err("Decompression failed! err=%d, page=%u\n",
273                ret, index);
274            rzs_stat64_inc(rzs, &rzs->stats.failed_reads);
275            goto out;
276        }
277
278        flush_dcache_page(page);
279        index++;
280    }
312281
313282    set_bit(BIO_UPTODATE, &bio->bi_flags);
314283    bio_endio(bio, 0);
...... 
321290
322291static int ramzswap_write(struct ramzswap *rzs, struct bio *bio)
323292{
324    int ret;
325    u32 offset, index;
326    size_t clen;
327    struct zobj_header *zheader;
328    struct page *page, *page_store;
329    unsigned char *user_mem, *cmem, *src;
293    int i;
294    u32 index;
295    struct bio_vec *bvec;
330296
331297    rzs_stat64_inc(rzs, &rzs->stats.num_writes);
332298
333    page = bio->bi_io_vec[0].bv_page;
334299    index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
335300
336    src = rzs->compress_buffer;
301    bio_for_each_segment(bvec, bio, i) {
302        int ret;
303        u32 offset;
304        size_t clen;
305        struct zobj_header *zheader;
306        struct page *page, *page_store;
307        unsigned char *user_mem, *cmem, *src;
337308
338    mutex_lock(&rzs->lock);
309        page = bvec->bv_page;
310        src = rzs->compress_buffer;
339311
340    user_mem = kmap_atomic(page, KM_USER0);
341    if (page_zero_filled(user_mem)) {
342        kunmap_atomic(user_mem, KM_USER0);
343        mutex_unlock(&rzs->lock);
344        rzs_stat_inc(&rzs->stats.pages_zero);
345        rzs_set_flag(rzs, index, RZS_ZERO);
312        /*
313         * System overwrites unused sectors. Free memory associated
314         * with this sector now.
315         */
316        if (rzs->table[index].page ||
317                rzs_test_flag(rzs, index, RZS_ZERO))
318            ramzswap_free_page(rzs, index);
346319
347        set_bit(BIO_UPTODATE, &bio->bi_flags);
348        bio_endio(bio, 0);
349        return 0;
350    }
320        mutex_lock(&rzs->lock);
351321
352    ret = lzo1x_1_compress(user_mem, PAGE_SIZE, src, &clen,
353                rzs->compress_workmem);
322        user_mem = kmap_atomic(page, KM_USER0);
323        if (page_zero_filled(user_mem)) {
324            kunmap_atomic(user_mem, KM_USER0);
325            mutex_unlock(&rzs->lock);
326            rzs_stat_inc(&rzs->stats.pages_zero);
327            rzs_set_flag(rzs, index, RZS_ZERO);
328            continue;
329        }
354330
355    kunmap_atomic(user_mem, KM_USER0);
331        ret = lzo1x_1_compress(user_mem, PAGE_SIZE, src, &clen,
332                    rzs->compress_workmem);
356333
357    if (unlikely(ret != LZO_E_OK)) {
358        mutex_unlock(&rzs->lock);
359        pr_err("Compression failed! err=%d\n", ret);
360        rzs_stat64_inc(rzs, &rzs->stats.failed_writes);
361        goto out;
362    }
334        kunmap_atomic(user_mem, KM_USER0);
363335
364    /*
365     * Page is incompressible. Store it as-is (uncompressed)
366     * since we do not want to return too many swap write
367     * errors which has side effect of hanging the system.
368     */
369    if (unlikely(clen > max_zpage_size)) {
370        clen = PAGE_SIZE;
371        page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
372        if (unlikely(!page_store)) {
336        if (unlikely(ret != LZO_E_OK)) {
373337            mutex_unlock(&rzs->lock);
374            pr_info("Error allocating memory for incompressible "
375                "page: %u\n", index);
338            pr_err("Compression failed! err=%d\n", ret);
376339            rzs_stat64_inc(rzs, &rzs->stats.failed_writes);
377340            goto out;
378341        }
379342
380        offset = 0;
381        rzs_set_flag(rzs, index, RZS_UNCOMPRESSED);
382        rzs_stat_inc(&rzs->stats.pages_expand);
383        rzs->table[index].page = page_store;
384        src = kmap_atomic(page, KM_USER0);
385        goto memstore;
386    }
343        /*
344         * Page is incompressible. Store it as-is (uncompressed)
345         * since we do not want to return too many swap write
346         * errors which has side effect of hanging the system.
347         */
348        if (unlikely(clen > max_zpage_size)) {
349            clen = PAGE_SIZE;
350            page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
351            if (unlikely(!page_store)) {
352                mutex_unlock(&rzs->lock);
353                pr_info("Error allocating memory for "
354                    "incompressible page: %u\n", index);
355                rzs_stat64_inc(rzs, &rzs->stats.failed_writes);
356                goto out;
357            }
358
359            offset = 0;
360            rzs_set_flag(rzs, index, RZS_UNCOMPRESSED);
361            rzs_stat_inc(&rzs->stats.pages_expand);
362            rzs->table[index].page = page_store;
363            src = kmap_atomic(page, KM_USER0);
364            goto memstore;
365        }
387366
388    if (xv_malloc(rzs->mem_pool, clen + sizeof(*zheader),
389            &rzs->table[index].page, &offset,
390            GFP_NOIO | __GFP_HIGHMEM)) {
391        mutex_unlock(&rzs->lock);
392        pr_info("Error allocating memory for compressed "
393            "page: %u, size=%zu\n", index, clen);
394        rzs_stat64_inc(rzs, &rzs->stats.failed_writes);
395        goto out;
396    }
367        if (xv_malloc(rzs->mem_pool, clen + sizeof(*zheader),
368                &rzs->table[index].page, &offset,
369                GFP_NOIO | __GFP_HIGHMEM)) {
370            mutex_unlock(&rzs->lock);
371            pr_info("Error allocating memory for compressed "
372                "page: %u, size=%zu\n", index, clen);
373            rzs_stat64_inc(rzs, &rzs->stats.failed_writes);
374            goto out;
375        }
397376
398377memstore:
399    rzs->table[index].offset = offset;
378        rzs->table[index].offset = offset;
400379
401    cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
402            rzs->table[index].offset;
380        cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
381                rzs->table[index].offset;
403382
404383#if 0
405    /* Back-reference needed for memory defragmentation */
406    if (!rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)) {
407        zheader = (struct zobj_header *)cmem;
408        zheader->table_idx = index;
409        cmem += sizeof(*zheader);
410    }
384        /* Back-reference needed for memory defragmentation */
385        if (!rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)) {
386            zheader = (struct zobj_header *)cmem;
387            zheader->table_idx = index;
388            cmem += sizeof(*zheader);
389        }
411390#endif
412391
413    memcpy(cmem, src, clen);
392        memcpy(cmem, src, clen);
414393
415    kunmap_atomic(cmem, KM_USER1);
416    if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
417        kunmap_atomic(src, KM_USER0);
394        kunmap_atomic(cmem, KM_USER1);
395        if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
396            kunmap_atomic(src, KM_USER0);
418397
419    /* Update stats */
420    rzs->stats.compr_size += clen;
421    rzs_stat_inc(&rzs->stats.pages_stored);
422    if (clen <= PAGE_SIZE / 2)
423        rzs_stat_inc(&rzs->stats.good_compress);
398        /* Update stats */
399        rzs->stats.compr_size += clen;
400        rzs_stat_inc(&rzs->stats.pages_stored);
401        if (clen <= PAGE_SIZE / 2)
402            rzs_stat_inc(&rzs->stats.good_compress);
424403
425    mutex_unlock(&rzs->lock);
404        mutex_unlock(&rzs->lock);
405        index++;
406    }
426407
427408    set_bit(BIO_UPTODATE, &bio->bi_flags);
428409    bio_endio(bio, 0);
...... 
436417/*
437418 * Check if request is within bounds and page aligned.
438419 */
439static inline int valid_swap_request(struct ramzswap *rzs, struct bio *bio)
420static inline int valid_io_request(struct ramzswap *rzs, struct bio *bio)
440421{
441422    if (unlikely(
442423        (bio->bi_sector >= (rzs->disksize >> SECTOR_SHIFT)) ||
443424        (bio->bi_sector & (SECTORS_PER_PAGE - 1)) ||
444        (bio->bi_vcnt != 1) ||
445        (bio->bi_size != PAGE_SIZE) ||
446        (bio->bi_io_vec[0].bv_offset != 0))) {
425        (bio->bi_size & (PAGE_SIZE - 1)))) {
447426
448427        return 0;
449428    }
450429
451    /* swap request is valid */
430    /* I/O request is valid */
452431    return 1;
453432}
454433
...... 
465444        return 0;
466445    }
467446
468    if (!valid_swap_request(rzs, bio)) {
447    if (!valid_io_request(rzs, bio)) {
469448        rzs_stat64_inc(rzs, &rzs->stats.invalid_io);
470449        bio_io_error(bio);
471450        return 0;
...... 
531510{
532511    int ret;
533512    size_t num_pages;
534    struct page *page;
535    union swap_header *swap_header;
536513
537514    if (rzs->init_done) {
538515        pr_info("Device already initialized!\n");
...... 
566543    }
567544    memset(rzs->table, 0, num_pages * sizeof(*rzs->table));
568545
569    page = alloc_page(__GFP_ZERO);
570    if (!page) {
571        pr_err("Error allocating swap header page\n");
572        ret = -ENOMEM;
573        goto fail;
574    }
575    rzs->table[0].page = page;
576    rzs_set_flag(rzs, 0, RZS_UNCOMPRESSED);
577
578    swap_header = kmap(page);
579    setup_swap_header(rzs, swap_header);
580    kunmap(page);
581
582546    set_capacity(rzs->disk, rzs->disksize >> SECTOR_SHIFT);
583547
584548    /* ramzswap devices sort of resembles non-rotational disks */
...... 
689653    rzs = bdev->bd_disk->private_data;
690654    ramzswap_free_page(rzs, index);
691655    rzs_stat64_inc(rzs, &rzs->stats.notify_free);
692
693    return;
694656}
695657
696static struct block_device_operations ramzswap_devops = {
658static const struct block_device_operations ramzswap_devops = {
697659    .ioctl = ramzswap_ioctl,
698660    .swap_slot_free_notify = ramzswap_slot_free_notify,
699661    .owner = THIS_MODULE
...... 
737699    /* Actual capacity set using RZSIO_SET_DISKSIZE_KB ioctl */
738700    set_capacity(rzs->disk, 0);
739701
702    /*
703     * To ensure that we always get PAGE_SIZE aligned
704     * and n*PAGE_SIZED sized I/O requests.
705     */
740706    blk_queue_physical_block_size(rzs->disk->queue, PAGE_SIZE);
741707    blk_queue_logical_block_size(rzs->disk->queue, PAGE_SIZE);
708    blk_queue_io_min(rzs->disk->queue, PAGE_SIZE);
709    blk_queue_io_opt(rzs->disk->queue, PAGE_SIZE);
742710
743711    add_disk(rzs->disk);
744712
drivers/staging/ramzswap/ramzswap_drv.h
112112    void *compress_buffer;
113113    struct table *table;
114114    spinlock_t stat64_lock; /* protect 64-bit stats */
115    struct mutex lock;
115    struct mutex lock; /* protect compression buffers against
116                 * concurrent writes */
116117    struct request_queue *queue;
117118    struct gendisk *disk;
118119    int init_done;
