mm/page_cgroup.c

#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/bit_spinlock.h>
#include <linux/page_cgroup.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>

static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
{
    pc->flags = 0;
    set_page_cgroup_array_id(pc, id);
    pc->mem_cgroup = NULL;
    INIT_LIST_HEAD(&pc->lru);
}

static unsigned long total_usage;

#if !defined(CONFIG_SPARSEMEM)

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
    pgdat->node_page_cgroup = NULL;
}

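/*
 * Without SPARSEMEM, each node keeps one flat page_cgroup array in
 * pgdat->node_page_cgroup; the entry for a page sits at index
 * (pfn - node_start_pfn).
 */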
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
    unsigned long pfn = page_to_pfn(page);
    unsigned long offset;
    struct page_cgroup *base;

    base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
    if (unlikely(!base))
        return NULL;

    offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
    return base + offset;
}

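/*
 * Reverse lookup: the array id stored in a page_cgroup is the node id, so
 * the pfn is the entry's offset within that node's array plus
 * node_start_pfn.
 */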
struct page *lookup_cgroup_page(struct page_cgroup *pc)
{
    unsigned long pfn;
    struct page *page;
    pg_data_t *pgdat;

    pgdat = NODE_DATA(page_cgroup_array_id(pc));
    pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
    page = pfn_to_page(pfn);
    VM_BUG_ON(pc != lookup_page_cgroup(page));
    return page;
}

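/* Allocate one page_cgroup for every page spanned by the node, from bootmem. */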
static int __init alloc_node_page_cgroup(int nid)
{
    struct page_cgroup *base, *pc;
    unsigned long table_size;
    unsigned long start_pfn, nr_pages, index;

    start_pfn = NODE_DATA(nid)->node_start_pfn;
    nr_pages = NODE_DATA(nid)->node_spanned_pages;

    if (!nr_pages)
        return 0;

    table_size = sizeof(struct page_cgroup) * nr_pages;

    base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
            table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
    if (!base)
        return -ENOMEM;
    for (index = 0; index < nr_pages; index++) {
        pc = base + index;
        init_page_cgroup(pc, nid);
    }
    NODE_DATA(nid)->node_page_cgroup = base;
    total_usage += table_size;
    return 0;
}

void __init page_cgroup_init_flatmem(void)
{
    int nid, fail;

    if (mem_cgroup_disabled())
        return;

    for_each_online_node(nid) {
        fail = alloc_node_page_cgroup(nid);
        if (fail)
            goto fail;
    }
    printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
    printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
        " don't want memory cgroups\n");
    return;
fail:
    printk(KERN_CRIT "allocation of page_cgroup failed.\n");
    printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
    panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */

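/*
 * With SPARSEMEM, each mem_section carries its own page_cgroup array.
 * section->page_cgroup is biased by the section's start pfn (see
 * init_section_page_cgroup()), so it can be indexed directly by pfn.
 */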
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
    unsigned long pfn = page_to_pfn(page);
    struct mem_section *section = __pfn_to_section(pfn);

    if (!section->page_cgroup)
        return NULL;
    return section->page_cgroup + pfn;
}

struct page *lookup_cgroup_page(struct page_cgroup *pc)
{
    struct mem_section *section;
    struct page *page;
    unsigned long nr;

    nr = page_cgroup_array_id(pc);
    section = __nr_to_section(nr);
    page = pfn_to_page(pc - section->page_cgroup);
    VM_BUG_ON(pc != lookup_page_cgroup(page));
    return page;
}

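/*
 * Try a node-local, physically contiguous allocation first and fall back
 * to vmalloc when a section-sized table cannot be had from the page
 * allocator.
 */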
static void *__meminit alloc_page_cgroup(size_t size, int nid)
{
    void *addr = NULL;

    addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN);
    if (addr)
        return addr;

    if (node_state(nid, N_HIGH_MEMORY))
        addr = vmalloc_node(size, nid);
    else
        addr = vmalloc(size);

    return addr;
}

#ifdef CONFIG_MEMORY_HOTPLUG
static void free_page_cgroup(void *addr)
{
    if (is_vmalloc_addr(addr)) {
        vfree(addr);
    } else {
        struct page *page = virt_to_page(addr);
        size_t table_size =
            sizeof(struct page_cgroup) * PAGES_PER_SECTION;

        BUG_ON(PageReserved(page));
        free_pages_exact(addr, table_size);
    }
}
#endif

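/*
 * Set up the page_cgroup array for the section containing "pfn".  Called at
 * boot from page_cgroup_init() and again when a memory section comes online.
 */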
static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
{
    struct page_cgroup *base, *pc;
    struct mem_section *section;
    unsigned long table_size;
    unsigned long nr;
    int index;

    nr = pfn_to_section_nr(pfn);
    section = __nr_to_section(nr);

    if (section->page_cgroup)
        return 0;

    table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
    base = alloc_page_cgroup(table_size, nid);

    /*
     * The value stored in section->page_cgroup is (base - pfn)
     * and it does not point to the memory block allocated above,
     * which would otherwise cause kmemleak false positives.
     */
    kmemleak_not_leak(base);

    if (!base) {
        printk(KERN_ERR "page cgroup allocation failure\n");
        return -ENOMEM;
    }

    for (index = 0; index < PAGES_PER_SECTION; index++) {
        pc = base + index;
        init_page_cgroup(pc, nr);
    }
    /*
     * The passed "pfn" may not be section-aligned, so mask it down to the
     * section boundary before computing the biased base pointer.
     */
    pfn &= PAGE_SECTION_MASK;
    section->page_cgroup = base - pfn;
    total_usage += table_size;
    return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void __free_page_cgroup(unsigned long pfn)
{
    struct mem_section *ms;
    struct page_cgroup *base;

    ms = __pfn_to_section(pfn);
    if (!ms || !ms->page_cgroup)
        return;
    base = ms->page_cgroup + pfn;
    free_page_cgroup(base);
    ms->page_cgroup = NULL;
}

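/*
 * Memory hotplug: expand the onlined range to section boundaries and set
 * up a page_cgroup array for every present section in it; roll everything
 * back on failure.
 */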
int __meminit online_page_cgroup(unsigned long start_pfn,
            unsigned long nr_pages,
            int nid)
{
    unsigned long start, end, pfn;
    int fail = 0;

    start = start_pfn & ~(PAGES_PER_SECTION - 1);
    end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);

    if (nid == -1) {
        /*
         * In this case the node already exists and contains valid memory.
         * "start_pfn" is the pfn that was passed to online_pages(), so it
         * must be valid.
         */
        nid = pfn_to_nid(start_pfn);
        VM_BUG_ON(!node_state(nid, N_ONLINE));
    }

    for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
        if (!pfn_present(pfn))
            continue;
        fail = init_section_page_cgroup(pfn, nid);
    }
    if (!fail)
        return 0;

    /* rollback */
    for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
        __free_page_cgroup(pfn);

    return -ENOMEM;
}

int __meminit offline_page_cgroup(unsigned long start_pfn,
        unsigned long nr_pages, int nid)
{
    unsigned long start, end, pfn;

    start = start_pfn & ~(PAGES_PER_SECTION - 1);
    end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);

    for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
        __free_page_cgroup(pfn);
    return 0;
}

static int __meminit page_cgroup_callback(struct notifier_block *self,
                   unsigned long action, void *arg)
{
    struct memory_notify *mn = arg;
    int ret = 0;
    switch (action) {
    case MEM_GOING_ONLINE:
        ret = online_page_cgroup(mn->start_pfn,
                   mn->nr_pages, mn->status_change_nid);
        break;
    case MEM_OFFLINE:
        offline_page_cgroup(mn->start_pfn,
                mn->nr_pages, mn->status_change_nid);
        break;
    case MEM_CANCEL_ONLINE:
    case MEM_GOING_OFFLINE:
        break;
    case MEM_ONLINE:
    case MEM_CANCEL_OFFLINE:
        break;
    }

    return notifier_from_errno(ret);
}

#endif

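/*
 * Boot-time setup for SPARSEMEM: for every node with memory, allocate
 * page_cgroup arrays for the sections it spans, then register the memory
 * hotplug notifier.
 */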
void __init page_cgroup_init(void)
{
    unsigned long pfn;
    int nid;

    if (mem_cgroup_disabled())
        return;

    for_each_node_state(nid, N_HIGH_MEMORY) {
        unsigned long start_pfn, end_pfn;

        start_pfn = node_start_pfn(nid);
        end_pfn = node_end_pfn(nid);
        /*
         * start_pfn and end_pfn may not be section-aligned, and the
         * page->flags of out-of-node pages are not initialized.  So
         * only pfns inside [start_pfn, end_pfn) are scanned, stepping
         * one section at a time.
         */
        for (pfn = start_pfn;
             pfn < end_pfn;
             pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {

            if (!pfn_valid(pfn))
                continue;
            /*
             * Nodes' pfn ranges can overlap.  Some architectures have a
             * node layout such as:
             * -------------pfn-------------->
             * N0 | N1 | N2 | N0 | N1 | N2 |....
             */
            if (pfn_to_nid(pfn) != nid)
                continue;
            if (init_section_page_cgroup(pfn, nid))
                goto oom;
        }
    }
    hotplug_memory_notifier(page_cgroup_callback, 0);
    printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
    printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
             "don't want memory cgroups\n");
    return;
oom:
    printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
    panic("Out of memory");
}

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
    return;
}

#endif


#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP

static DEFINE_MUTEX(swap_cgroup_mutex);
struct swap_cgroup_ctrl {
    struct page **map;
    unsigned long length;
    spinlock_t lock;
};

struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

struct swap_cgroup {
    unsigned short id;
};
#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
#define SC_POS_MASK (SC_PER_PAGE - 1)

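/*
 * Each map page packs SC_PER_PAGE swap_cgroup entries.  For example, with
 * 4KiB pages and a two-byte id, SC_PER_PAGE is 2048, so swap offset 5000
 * lands in map page idx = 5000 / 2048 = 2 at position pos = 5000 & 2047 = 904.
 */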
/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, swap_cgroup is accessed via memcg's charge/uncharge
 * against SwapCache.  At swap_free(), it is accessed directly from swap.
 *
 * This means:
 * - there is no race on "exchange" when we are accessed via SwapCache,
 *   because the SwapCache (and its swp_entry) is under lock.
 * - when called via swap_free(), there is no remaining user of the entry,
 *   hence no race.
 * So no lock is needed around "exchange".
 *
 * TODO: these buffers could be pushed out to HIGHMEM.
 */

/*
 * allocate buffer for swap_cgroup.
 */
static int swap_cgroup_prepare(int type)
{
    struct page *page;
    struct swap_cgroup_ctrl *ctrl;
    unsigned long idx, max;

    ctrl = &swap_cgroup_ctrl[type];

    for (idx = 0; idx < ctrl->length; idx++) {
        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
        if (!page)
            goto not_enough_page;
        ctrl->map[idx] = page;
    }
    return 0;
not_enough_page:
    max = idx;
    for (idx = 0; idx < max; idx++)
        __free_page(ctrl->map[idx]);

    return -ENOMEM;
}

/**
 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 * @ent: swap entry to be cmpxchged
 * @old: old id
 * @new: new id
 *
 * Returns the old id on success, 0 on failure.
 * (There is no mem_cgroup using 0 as its id.)
 */
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
                    unsigned short old, unsigned short new)
{
    int type = swp_type(ent);
    unsigned long offset = swp_offset(ent);
    unsigned long idx = offset / SC_PER_PAGE;
    unsigned long pos = offset & SC_POS_MASK;
    struct swap_cgroup_ctrl *ctrl;
    struct page *mappage;
    struct swap_cgroup *sc;
    unsigned long flags;
    unsigned short retval;

    ctrl = &swap_cgroup_ctrl[type];

    mappage = ctrl->map[idx];
    sc = page_address(mappage);
    sc += pos;
    spin_lock_irqsave(&ctrl->lock, flags);
    retval = sc->id;
    if (retval == old)
        sc->id = new;
    else
        retval = 0;
    spin_unlock_irqrestore(&ctrl->lock, flags);
    return retval;
}

/**
 * swap_cgroup_record - record mem_cgroup for this swp_entry.
 * @ent: swap entry to be recorded into
 * @id: mem_cgroup's css id to be recorded
 *
 * Returns the id previously recorded for the entry.
 * (Of course, the old id can be 0.)
 */
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
{
    int type = swp_type(ent);
    unsigned long offset = swp_offset(ent);
    unsigned long idx = offset / SC_PER_PAGE;
    unsigned long pos = offset & SC_POS_MASK;
    struct swap_cgroup_ctrl *ctrl;
    struct page *mappage;
    struct swap_cgroup *sc;
    unsigned short old;
    unsigned long flags;

    ctrl = &swap_cgroup_ctrl[type];

    mappage = ctrl->map[idx];
    sc = page_address(mappage);
    sc += pos;
    spin_lock_irqsave(&ctrl->lock, flags);
    old = sc->id;
    sc->id = id;
    spin_unlock_irqrestore(&ctrl->lock, flags);

    return old;
}

/**
 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
 * @ent: swap entry to be looked up.
 *
 * Returns the CSS ID of the mem_cgroup recorded for the entry, or 0 if
 * none (0 is an invalid ID).
 */
unsigned short lookup_swap_cgroup(swp_entry_t ent)
{
    int type = swp_type(ent);
    unsigned long offset = swp_offset(ent);
    unsigned long idx = offset / SC_PER_PAGE;
    unsigned long pos = offset & SC_POS_MASK;
    struct swap_cgroup_ctrl *ctrl;
    struct page *mappage;
    struct swap_cgroup *sc;
    unsigned short ret;

    ctrl = &swap_cgroup_ctrl[type];
    mappage = ctrl->map[idx];
    sc = page_address(mappage);
    sc += pos;
    ret = sc->id;
    return ret;
}

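/*
 * swapon: size ctrl->map so that every swap offset up to max_pages has a
 * swap_cgroup slot (one map page per SC_PER_PAGE entries), then populate
 * it in swap_cgroup_prepare().
 */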
int swap_cgroup_swapon(int type, unsigned long max_pages)
{
    void *array;
    unsigned long array_size;
    unsigned long length;
    struct swap_cgroup_ctrl *ctrl;

    if (!do_swap_account)
        return 0;

    length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
    array_size = length * sizeof(void *);

    array = vmalloc(array_size);
    if (!array)
        goto nomem;

    memset(array, 0, array_size);
    ctrl = &swap_cgroup_ctrl[type];
    mutex_lock(&swap_cgroup_mutex);
    ctrl->length = length;
    ctrl->map = array;
    spin_lock_init(&ctrl->lock);
    if (swap_cgroup_prepare(type)) {
        /* memory shortage */
        ctrl->map = NULL;
        ctrl->length = 0;
        mutex_unlock(&swap_cgroup_mutex);
        vfree(array);
        goto nomem;
    }
    mutex_unlock(&swap_cgroup_mutex);

    return 0;
nomem:
    printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
    printk(KERN_INFO
        "swap_cgroup can be disabled by noswapaccount boot option\n");
    return -ENOMEM;
}

void swap_cgroup_swapoff(int type)
{
    struct page **map;
    unsigned long i, length;
    struct swap_cgroup_ctrl *ctrl;

    if (!do_swap_account)
        return;

    mutex_lock(&swap_cgroup_mutex);
    ctrl = &swap_cgroup_ctrl[type];
    map = ctrl->map;
    length = ctrl->length;
    ctrl->map = NULL;
    ctrl->length = 0;
    mutex_unlock(&swap_cgroup_mutex);

    if (map) {
        for (i = 0; i < length; i++) {
            struct page *page = map[i];
            if (page)
                __free_page(page);
        }
        vfree(map);
    }
}

#endif