/*
 * linux/fs/mbcache.c
 * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
 */

/*
 * Filesystem Meta Information Block Cache (mbcache)
 *
 * The mbcache caches blocks of block devices that need to be located
 * by their device/block number, as well as by other criteria (such
 * as the block's contents).
 *
 * There can only be one cache entry in a cache per device and block number.
 * Additional indexes need not be unique in this sense. The number of
 * additional indexes (=other criteria) can be hardwired at compile time
 * or specified at cache create time.
 *
 * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
 * in the cache. A valid entry is in the main hash tables of the cache,
 * and may also be in the lru list. An invalid entry is not in any hashes
 * or lists.
 *
 * A valid cache entry is only in the lru list if no handles refer to it.
 * Invalid cache entries will be freed when the last handle to the cache
 * entry is released. Entries that cannot be freed immediately are put
 * back on the lru list.
 */
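/*
 * A summary of the entry state transitions implied by the rules above
 * (an informational sketch, not additional API):
 *
 *	mb_cache_entry_alloc    -> invalid; handle held; in no lists
 *	mb_cache_entry_insert   -> valid; handle held; hashed, not on lru
 *	mb_cache_entry_release  -> valid; no handles; hashed, on the lru
 *	mb_cache_entry_get/find -> valid; handle held; taken off the lru
 *	mb_cache_entry_free     -> invalid (unhashed); freed when the
 *	                           last handle is released
 */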

#include <linux/kernel.h>
#include <linux/module.h>

#include <linux/hash.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/mbcache.h>


#ifdef MB_CACHE_DEBUG
# define mb_debug(f...) do { \
        printk(KERN_DEBUG f); \
        printk("\n"); \
    } while (0)
# define mb_assert(c) do { if (!(c)) \
        printk(KERN_ERR "assertion " #c " failed\n"); \
    } while (0)
#else
# define mb_debug(f...) do { } while (0)
# define mb_assert(c) do { } while (0)
#endif
#define mb_error(f...) do { \
        printk(KERN_ERR f); \
        printk("\n"); \
    } while (0)

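/*
 * e_used encodes both kinds of handles: each reader holding the entry
 * adds one, and an exclusive ("writer") hold adds MB_CACHE_WRITER on
 * top of that, so e_used >= MB_CACHE_WRITER means the entry is held
 * for exclusive access.
 */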
#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)

static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);

MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
MODULE_LICENSE("GPL");

EXPORT_SYMBOL(mb_cache_create);
EXPORT_SYMBOL(mb_cache_shrink);
EXPORT_SYMBOL(mb_cache_destroy);
EXPORT_SYMBOL(mb_cache_entry_alloc);
EXPORT_SYMBOL(mb_cache_entry_insert);
EXPORT_SYMBOL(mb_cache_entry_release);
EXPORT_SYMBOL(mb_cache_entry_free);
EXPORT_SYMBOL(mb_cache_entry_get);
#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
EXPORT_SYMBOL(mb_cache_entry_find_first);
EXPORT_SYMBOL(mb_cache_entry_find_next);
#endif

struct mb_cache {
    struct list_head c_cache_list;
    const char *c_name;
    struct mb_cache_op c_op;
    atomic_t c_entry_count;
    int c_bucket_bits;
#ifndef MB_CACHE_INDEXES_COUNT
    int c_indexes_count;
#endif
    struct kmem_cache *c_entry_cache;
    struct list_head *c_block_hash;
    struct list_head *c_indexes_hash[0];
};


/*
 * Global data: list of all mbcaches, lru list, and a spinlock for
 * accessing cache data structures on SMP machines. The lru list is
 * global across all mbcaches.
 */

static LIST_HEAD(mb_cache_list);
static LIST_HEAD(mb_cache_lru_list);
static DEFINE_SPINLOCK(mb_cache_spinlock);

static inline int
mb_cache_indexes(struct mb_cache *cache)
{
#ifdef MB_CACHE_INDEXES_COUNT
    return MB_CACHE_INDEXES_COUNT;
#else
    return cache->c_indexes_count;
#endif
}

/*
 * The shrinker that mbcache registers with the kernel's memory
 * management, so that cache entries can be reclaimed dynamically
 * under memory pressure.
 */

static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask);

static struct shrinker mb_cache_shrinker = {
    .shrink = mb_cache_shrink_fn,
    .seeks = DEFAULT_SEEKS,
};

static inline int
__mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
{
    return !list_empty(&ce->e_block_list);
}


static void
__mb_cache_entry_unhash(struct mb_cache_entry *ce)
{
    int n;

    if (__mb_cache_entry_is_hashed(ce)) {
        list_del_init(&ce->e_block_list);
        for (n = 0; n < mb_cache_indexes(ce->e_cache); n++)
            list_del(&ce->e_indexes[n].o_list);
    }
}


static void
__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
{
    struct mb_cache *cache = ce->e_cache;

    mb_assert(!(ce->e_used || ce->e_queued));
    if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) {
        /* free failed -- put back on the lru list
           for freeing later. */
        spin_lock(&mb_cache_spinlock);
        list_add(&ce->e_lru_list, &mb_cache_lru_list);
        spin_unlock(&mb_cache_spinlock);
    } else {
        kmem_cache_free(cache->c_entry_cache, ce);
        atomic_dec(&cache->c_entry_count);
    }
}


static void
__mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
    __releases(mb_cache_spinlock)
{
    /* Wake up all processes queuing for this cache entry. */
    if (ce->e_queued)
        wake_up_all(&mb_cache_queue);
    if (ce->e_used >= MB_CACHE_WRITER)
        ce->e_used -= MB_CACHE_WRITER;
    ce->e_used--;
    if (!(ce->e_used || ce->e_queued)) {
        if (!__mb_cache_entry_is_hashed(ce))
            goto forget;
        mb_assert(list_empty(&ce->e_lru_list));
        list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
    }
    spin_unlock(&mb_cache_spinlock);
    return;
forget:
    spin_unlock(&mb_cache_spinlock);
    __mb_cache_entry_forget(ce, GFP_KERNEL);
}


/*
 * mb_cache_shrink_fn()  memory pressure callback
 *
 * This function is called by the kernel memory management when memory
 * gets low.
 *
 * @nr_to_scan: Number of objects to scan
 * @gfp_mask: (ignored)
 *
 * Returns the number of objects in the cache, scaled by
 * sysctl_vfs_cache_pressure.
 */
static int
mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
{
    LIST_HEAD(free_list);
    struct list_head *l, *ltmp;
    int count = 0;

    spin_lock(&mb_cache_spinlock);
    list_for_each(l, &mb_cache_list) {
        struct mb_cache *cache =
            list_entry(l, struct mb_cache, c_cache_list);
        mb_debug("cache %s (%d)", cache->c_name,
              atomic_read(&cache->c_entry_count));
        count += atomic_read(&cache->c_entry_count);
    }
    mb_debug("trying to free %d entries", nr_to_scan);
    if (nr_to_scan == 0) {
        spin_unlock(&mb_cache_spinlock);
        goto out;
    }
    while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) {
        struct mb_cache_entry *ce =
            list_entry(mb_cache_lru_list.next,
                   struct mb_cache_entry, e_lru_list);
        list_move_tail(&ce->e_lru_list, &free_list);
        __mb_cache_entry_unhash(ce);
    }
    spin_unlock(&mb_cache_spinlock);
    list_for_each_safe(l, ltmp, &free_list) {
        __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
                           e_lru_list), gfp_mask);
    }
out:
    return (count / 100) * sysctl_vfs_cache_pressure;
}
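/*
 * A worked example of the return value above (assuming the default
 * sysctl_vfs_cache_pressure of 100): with 1000 entries cached across
 * all mbcaches, the shrinker reports (1000 / 100) * 100 = 1000
 * freeable objects; a vfs_cache_pressure of 200 would report 2000.
 */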


/*
 * mb_cache_create()  create a new cache
 *
 * All entries in one cache are equal size. Cache entries may be from
 * multiple devices. If this is the first mbcache created, registers
 * the cache with kernel memory management. Returns NULL if no more
 * memory was available.
 *
 * @name: name of the cache (informal)
 * @cache_op: contains the callback called when freeing a cache entry
 * @entry_size: The size of a cache entry, including
 *              struct mb_cache_entry
 * @indexes_count: number of additional indexes in the cache. Must equal
 *                 MB_CACHE_INDEXES_COUNT if the number of indexes is
 *                 hardwired.
 * @bucket_bits: log2(number of hash buckets)
 */
struct mb_cache *
mb_cache_create(const char *name, struct mb_cache_op *cache_op,
        size_t entry_size, int indexes_count, int bucket_bits)
{
    int m = 0, n, bucket_count = 1 << bucket_bits;
    struct mb_cache *cache = NULL;

    if (entry_size < sizeof(struct mb_cache_entry) +
       indexes_count * sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]))
        return NULL;

    cache = kmalloc(sizeof(struct mb_cache) +
                    indexes_count * sizeof(struct list_head), GFP_KERNEL);
    if (!cache)
        goto fail;
    cache->c_name = name;
    cache->c_op.free = NULL;
    if (cache_op)
        cache->c_op.free = cache_op->free;
    atomic_set(&cache->c_entry_count, 0);
    cache->c_bucket_bits = bucket_bits;
#ifdef MB_CACHE_INDEXES_COUNT
    mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT);
#else
    cache->c_indexes_count = indexes_count;
#endif
    cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
                                  GFP_KERNEL);
    if (!cache->c_block_hash)
        goto fail;
    for (n = 0; n < bucket_count; n++)
        INIT_LIST_HEAD(&cache->c_block_hash[n]);
    for (m = 0; m < indexes_count; m++) {
        cache->c_indexes_hash[m] = kmalloc(bucket_count *
                                         sizeof(struct list_head),
                                         GFP_KERNEL);
        if (!cache->c_indexes_hash[m])
            goto fail;
        for (n = 0; n < bucket_count; n++)
            INIT_LIST_HEAD(&cache->c_indexes_hash[m][n]);
    }
    cache->c_entry_cache = kmem_cache_create(name, entry_size, 0,
        SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
    if (!cache->c_entry_cache)
        goto fail;

    spin_lock(&mb_cache_spinlock);
    list_add(&cache->c_cache_list, &mb_cache_list);
    spin_unlock(&mb_cache_spinlock);
    return cache;

fail:
    if (cache) {
        while (--m >= 0)
            kfree(cache->c_indexes_hash[m]);
        kfree(cache->c_block_hash);
        kfree(cache);
    }
    return NULL;
}
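/*
 * A sketch of typical usage, modeled on how the ext2 xattr code creates
 * its cache (the cache name, the index count of 1 and the 6 bucket bits
 * are illustrative values, not mandated by this API):
 *
 *	static struct mb_cache *xattr_cache;
 *
 *	xattr_cache = mb_cache_create("ext2_xattr", NULL,
 *		sizeof(struct mb_cache_entry) +
 *		sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]),
 *		1, 6);
 *	if (!xattr_cache)
 *		return -ENOMEM;
 */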


/*
 * mb_cache_shrink()
 *
 * Removes all cache entries of a device from the cache. Entries that
 * are currently in use cannot be freed and thus remain in the cache;
 * all others are freed.
 *
 * @bdev: which device's cache entries to shrink
 */
void
mb_cache_shrink(struct block_device *bdev)
{
    LIST_HEAD(free_list);
    struct list_head *l, *ltmp;

    spin_lock(&mb_cache_spinlock);
    list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
        struct mb_cache_entry *ce =
            list_entry(l, struct mb_cache_entry, e_lru_list);
        if (ce->e_bdev == bdev) {
            list_move_tail(&ce->e_lru_list, &free_list);
            __mb_cache_entry_unhash(ce);
        }
    }
    spin_unlock(&mb_cache_spinlock);
    list_for_each_safe(l, ltmp, &free_list) {
        __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
                           e_lru_list), GFP_KERNEL);
    }
}


/*
 * mb_cache_destroy()
 *
 * Shrinks the cache to its minimum possible size (hopefully 0 entries),
 * and then destroys it. (The shrinker itself is only unregistered when
 * the module is unloaded; see exit_mbcache() below.)
 */
void
mb_cache_destroy(struct mb_cache *cache)
{
    LIST_HEAD(free_list);
    struct list_head *l, *ltmp;
    int n;

    spin_lock(&mb_cache_spinlock);
    list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
        struct mb_cache_entry *ce =
            list_entry(l, struct mb_cache_entry, e_lru_list);
        if (ce->e_cache == cache) {
            list_move_tail(&ce->e_lru_list, &free_list);
            __mb_cache_entry_unhash(ce);
        }
    }
    list_del(&cache->c_cache_list);
    spin_unlock(&mb_cache_spinlock);

    list_for_each_safe(l, ltmp, &free_list) {
        __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
                           e_lru_list), GFP_KERNEL);
    }

    if (atomic_read(&cache->c_entry_count) > 0) {
        mb_error("cache %s: %d orphaned entries",
              cache->c_name,
              atomic_read(&cache->c_entry_count));
    }

    kmem_cache_destroy(cache->c_entry_cache);

    for (n = 0; n < mb_cache_indexes(cache); n++)
        kfree(cache->c_indexes_hash[n]);
    kfree(cache->c_block_hash);
    kfree(cache);
}


/*
 * mb_cache_entry_alloc()
 *
 * Allocates a new cache entry. The new entry will not be valid initially,
 * and thus cannot be looked up yet. It should be filled with data, and
 * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
 * if no more memory was available.
 */
struct mb_cache_entry *
mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
{
    struct mb_cache_entry *ce;

    ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
    if (ce) {
        atomic_inc(&cache->c_entry_count);
        INIT_LIST_HEAD(&ce->e_lru_list);
        INIT_LIST_HEAD(&ce->e_block_list);
        ce->e_cache = cache;
        ce->e_used = 1 + MB_CACHE_WRITER;
        ce->e_queued = 0;
    }
    return ce;
}


/*
 * mb_cache_entry_insert()
 *
 * Inserts an entry that was allocated using mb_cache_entry_alloc() into
 * the cache. After this, the cache entry can be looked up, but is not yet
 * in the lru list as the caller still holds a handle to it. Returns 0 on
 * success, or -EBUSY if a cache entry for that device + block exists
 * already (this may happen after a failed lookup, if another process
 * has inserted the same cache entry in the meantime).
 *
 * @bdev: device the cache entry belongs to
 * @block: block number
 * @keys: array of additional keys. There must be indexes_count entries
 *        in the array (as specified when creating the cache).
 */
int
mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
              sector_t block, unsigned int keys[])
{
    struct mb_cache *cache = ce->e_cache;
    unsigned int bucket;
    struct list_head *l;
    int error = -EBUSY, n;

    bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
               cache->c_bucket_bits);
    spin_lock(&mb_cache_spinlock);
    list_for_each_prev(l, &cache->c_block_hash[bucket]) {
        struct mb_cache_entry *ce =
            list_entry(l, struct mb_cache_entry, e_block_list);
        if (ce->e_bdev == bdev && ce->e_block == block)
            goto out;
    }
    __mb_cache_entry_unhash(ce);
    ce->e_bdev = bdev;
    ce->e_block = block;
    list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
    for (n = 0; n < mb_cache_indexes(cache); n++) {
        ce->e_indexes[n].o_key = keys[n];
        bucket = hash_long(keys[n], cache->c_bucket_bits);
        list_add(&ce->e_indexes[n].o_list,
             &cache->c_indexes_hash[n][bucket]);
    }
    error = 0;
out:
    spin_unlock(&mb_cache_spinlock);
    return error;
}
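/*
 * A sketch of the allocate-and-insert path, modeled on the ext2 xattr
 * cache code (`cache', `bh' and `hash' are illustrative names; -EBUSY
 * simply means another process inserted the same block first):
 *
 *	struct mb_cache_entry *ce;
 *	int error;
 *
 *	ce = mb_cache_entry_alloc(cache, GFP_NOFS);
 *	if (!ce)
 *		return -ENOMEM;
 *	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
 *	if (error)
 *		mb_cache_entry_free(ce);	// drop the duplicate
 *	else
 *		mb_cache_entry_release(ce);	// entry is now cached
 */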


/*
 * mb_cache_entry_release()
 *
 * Release a handle to a cache entry. When the last handle to a cache entry
 * is released it is either freed (if it is invalid) or otherwise inserted
 * into the lru list.
 */
void
mb_cache_entry_release(struct mb_cache_entry *ce)
{
    spin_lock(&mb_cache_spinlock);
    __mb_cache_entry_release_unlock(ce);
}


/*
 * mb_cache_entry_free()
 *
 * Invalidates a cache entry and releases the handle to it: the entry is
 * removed from the hashes immediately, so it can no longer be looked up,
 * and it is freed once the last handle to it has been released.
 */
void
mb_cache_entry_free(struct mb_cache_entry *ce)
{
    spin_lock(&mb_cache_spinlock);
    mb_assert(list_empty(&ce->e_lru_list));
    __mb_cache_entry_unhash(ce);
    __mb_cache_entry_release_unlock(ce);
}


/*
 * mb_cache_entry_get()
 *
 * Get a cache entry by device / block number. (There can only be one entry
 * in the cache per device and block.) Returns NULL if no such cache entry
 * exists. The returned cache entry is locked for exclusive access ("single
 * writer").
 */
struct mb_cache_entry *
mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
           sector_t block)
{
    unsigned int bucket;
    struct list_head *l;
    struct mb_cache_entry *ce;

    bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
               cache->c_bucket_bits);
    spin_lock(&mb_cache_spinlock);
    list_for_each(l, &cache->c_block_hash[bucket]) {
        ce = list_entry(l, struct mb_cache_entry, e_block_list);
        if (ce->e_bdev == bdev && ce->e_block == block) {
            DEFINE_WAIT(wait);

            if (!list_empty(&ce->e_lru_list))
                list_del_init(&ce->e_lru_list);

            while (ce->e_used > 0) {
                ce->e_queued++;
                prepare_to_wait(&mb_cache_queue, &wait,
                        TASK_UNINTERRUPTIBLE);
                spin_unlock(&mb_cache_spinlock);
                schedule();
                spin_lock(&mb_cache_spinlock);
                ce->e_queued--;
            }
            finish_wait(&mb_cache_queue, &wait);
            ce->e_used += 1 + MB_CACHE_WRITER;

            if (!__mb_cache_entry_is_hashed(ce)) {
                __mb_cache_entry_release_unlock(ce);
                return NULL;
            }
            goto cleanup;
        }
    }
    ce = NULL;

cleanup:
    spin_unlock(&mb_cache_spinlock);
    return ce;
}
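/*
 * A sketch of using mb_cache_entry_get() to invalidate the entry for a
 * block that is going away, modeled on the ext2 xattr delete path
 * (`cache' and `bh' are illustrative names):
 *
 *	struct mb_cache_entry *ce;
 *
 *	ce = mb_cache_entry_get(cache, bh->b_bdev, bh->b_blocknr);
 *	if (ce)
 *		mb_cache_entry_free(ce);	// unhash and drop the handle
 */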

#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)

static struct mb_cache_entry *
__mb_cache_entry_find(struct list_head *l, struct list_head *head,
              int index, struct block_device *bdev, unsigned int key)
{
    while (l != head) {
        struct mb_cache_entry *ce =
            list_entry(l, struct mb_cache_entry,
                       e_indexes[index].o_list);
        if (ce->e_bdev == bdev && ce->e_indexes[index].o_key == key) {
            DEFINE_WAIT(wait);

            if (!list_empty(&ce->e_lru_list))
                list_del_init(&ce->e_lru_list);

            /* Incrementing before holding the lock gives readers
               priority over writers. */
            ce->e_used++;
            while (ce->e_used >= MB_CACHE_WRITER) {
                ce->e_queued++;
                prepare_to_wait(&mb_cache_queue, &wait,
                        TASK_UNINTERRUPTIBLE);
                spin_unlock(&mb_cache_spinlock);
                schedule();
                spin_lock(&mb_cache_spinlock);
                ce->e_queued--;
            }
            finish_wait(&mb_cache_queue, &wait);

            if (!__mb_cache_entry_is_hashed(ce)) {
                __mb_cache_entry_release_unlock(ce);
                spin_lock(&mb_cache_spinlock);
                return ERR_PTR(-EAGAIN);
            }
            return ce;
        }
        l = l->next;
    }
    return NULL;
}


/*
 * mb_cache_entry_find_first()
 *
 * Find the first cache entry on a given device with a certain key in
 * an additional index. Additional matches can be found with
 * mb_cache_entry_find_next(). Returns NULL if no match was found. The
 * returned cache entry is locked for shared access ("multiple readers").
 *
 * @cache: the cache to search
 * @index: the number of the additional index to search (0<=index<indexes_count)
 * @bdev: the device the cache entry should belong to
 * @key: the key in the index
 */
struct mb_cache_entry *
mb_cache_entry_find_first(struct mb_cache *cache, int index,
              struct block_device *bdev, unsigned int key)
{
    unsigned int bucket = hash_long(key, cache->c_bucket_bits);
    struct list_head *l;
    struct mb_cache_entry *ce;

    mb_assert(index < mb_cache_indexes(cache));
    spin_lock(&mb_cache_spinlock);
    l = cache->c_indexes_hash[index][bucket].next;
    ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
                               index, bdev, key);
    spin_unlock(&mb_cache_spinlock);
    return ce;
}


/*
 * mb_cache_entry_find_next()
 *
 * Find the next cache entry on a given device with a certain key in an
 * additional index. Returns NULL if no match could be found. The previous
 * entry is automatically released, so that mb_cache_entry_find_next() can
 * be called like this:
 *
 *	entry = mb_cache_entry_find_first();
 *	while (entry) {
 *		...
 *		entry = mb_cache_entry_find_next(entry, ...);
 *	}
 *
 * @prev: The previous match
 * @index: the number of the additional index to search (0<=index<indexes_count)
 * @bdev: the device the cache entry should belong to
 * @key: the key in the index
 */
struct mb_cache_entry *
mb_cache_entry_find_next(struct mb_cache_entry *prev, int index,
             struct block_device *bdev, unsigned int key)
{
    struct mb_cache *cache = prev->e_cache;
    unsigned int bucket = hash_long(key, cache->c_bucket_bits);
    struct list_head *l;
    struct mb_cache_entry *ce;

    mb_assert(index < mb_cache_indexes(cache));
    spin_lock(&mb_cache_spinlock);
    l = prev->e_indexes[index].o_list.next;
    ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
                               index, bdev, key);
    __mb_cache_entry_release_unlock(prev);
    return ce;
}
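/*
 * A sketch of a full index walk (`cache', `index', `bdev', `key' and the
 * entry_matches() predicate are illustrative). Note that the find
 * functions can also return ERR_PTR(-EAGAIN) when a matching entry was
 * invalidated while waiting for it, and that an entry still held when
 * the walk stops early must be released explicitly:
 *
 *	struct mb_cache_entry *ce;
 *
 *	ce = mb_cache_entry_find_first(cache, index, bdev, key);
 *	while (!IS_ERR(ce) && ce != NULL) {
 *		if (entry_matches(ce)) {
 *			// use ce, then drop the handle ourselves
 *			mb_cache_entry_release(ce);
 *			return;
 *		}
 *		ce = mb_cache_entry_find_next(ce, index, bdev, key);
 *	}
 */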

#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */

static int __init init_mbcache(void)
{
    register_shrinker(&mb_cache_shrinker);
    return 0;
}

static void __exit exit_mbcache(void)
{
    unregister_shrinker(&mb_cache_shrinker);
}

module_init(init_mbcache)
module_exit(exit_mbcache)