drivers/iommu/intel-iommu.c

1/*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
22 */
23
24#include <linux/init.h>
25#include <linux/bitmap.h>
26#include <linux/debugfs.h>
27#include <linux/export.h>
28#include <linux/slab.h>
29#include <linux/irq.h>
30#include <linux/interrupt.h>
31#include <linux/spinlock.h>
32#include <linux/pci.h>
33#include <linux/dmar.h>
34#include <linux/dma-mapping.h>
35#include <linux/mempool.h>
36#include <linux/timer.h>
37#include <linux/iova.h>
38#include <linux/iommu.h>
39#include <linux/intel-iommu.h>
40#include <linux/syscore_ops.h>
41#include <linux/tboot.h>
42#include <linux/dmi.h>
43#include <linux/pci-ats.h>
44#include <linux/memblock.h>
45#include <asm/irq_remapping.h>
46#include <asm/cacheflush.h>
47#include <asm/iommu.h>
48
49#define ROOT_SIZE VTD_PAGE_SIZE
50#define CONTEXT_SIZE VTD_PAGE_SIZE
51
52#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56#define IOAPIC_RANGE_START (0xfee00000)
57#define IOAPIC_RANGE_END (0xfeefffff)
58#define IOVA_START_ADDR (0x1000)
59
60#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62#define MAX_AGAW_WIDTH 64
63
64#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
65#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66
67/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
68   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
69#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
70                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
71#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
72
73#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
74#define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
75#define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
76
77/* page table handling */
78#define LEVEL_STRIDE (9)
79#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
80
81/*
82 * This bitmap is used to advertise the page sizes our hardware supports
83 * to the IOMMU core, which will then use this information to split
84 * physically contiguous memory regions it is mapping into page sizes
85 * that we support.
86 *
87 * Traditionally the IOMMU core just handed us the mappings directly,
88 * after making sure the size is an order of a 4KiB page and that the
89 * mapping has natural alignment.
90 *
91 * To retain this behavior, we currently advertise that we support
92 * all page sizes that are an order of 4KiB.
93 *
94 * If at some point we'd like to utilize the IOMMU core's new behavior,
95 * we could change this to advertise the real page sizes we support.
96 */
97#define INTEL_IOMMU_PGSIZES (~0xFFFUL)
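/*
 * Illustrative sketch, not part of the upstream driver: one way a caller
 * could test whether a size is advertised by this bitmap. Every set bit k
 * advertises a 2^k-byte page size, and ~0xFFFUL sets bits 12 and up, i.e.
 * all power-of-two sizes from 4KiB upward. The helper name is hypothetical;
 * is_power_of_2() comes from <linux/log2.h>.
 */
#if 0 /* illustration only */
static bool example_pgsize_advertised(unsigned long pgsize_bitmap, size_t size)
{
    /* A size is advertised iff it is a power of two and its bit is set. */
    return is_power_of_2(size) && (pgsize_bitmap & size);
}
/* example_pgsize_advertised(INTEL_IOMMU_PGSIZES, 2UL << 20) evaluates true */
#endif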
98
99static inline int agaw_to_level(int agaw)
100{
101    return agaw + 2;
102}
103
104static inline int agaw_to_width(int agaw)
105{
106    return 30 + agaw * LEVEL_STRIDE;
107}
108
109static inline int width_to_agaw(int width)
110{
111    return (width - 30) / LEVEL_STRIDE;
112}
113
114static inline unsigned int level_to_offset_bits(int level)
115{
116    return (level - 1) * LEVEL_STRIDE;
117}
118
119static inline int pfn_level_offset(unsigned long pfn, int level)
120{
121    return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
122}
123
124static inline unsigned long level_mask(int level)
125{
126    return -1UL << level_to_offset_bits(level);
127}
128
129static inline unsigned long level_size(int level)
130{
131    return 1UL << level_to_offset_bits(level);
132}
133
134static inline unsigned long align_to_level(unsigned long pfn, int level)
135{
136    return (pfn + level_size(level) - 1) & level_mask(level);
137}
138
139static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
140{
141    return 1 << ((lvl - 1) * LEVEL_STRIDE);
142}
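/*
 * Worked example: with DEFAULT_DOMAIN_ADDRESS_WIDTH = 48, width_to_agaw(48)
 * = (48 - 30) / 9 = 2, agaw_to_level(2) = 4 page-table levels and
 * agaw_to_width(2) = 48 again. At level 2, level_to_offset_bits(2) = 9, so
 * pfn_level_offset() selects pfn bits 9-17, and level_size(2) =
 * lvl_to_nr_pages(2) = 512 VT-d pages (2MiB).
 */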
143
144/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
145   are never going to work. */
146static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
147{
148    return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
149}
150
151static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
152{
153    return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
154}
155static inline unsigned long page_to_dma_pfn(struct page *pg)
156{
157    return mm_to_dma_pfn(page_to_pfn(pg));
158}
159static inline unsigned long virt_to_dma_pfn(void *p)
160{
161    return page_to_dma_pfn(virt_to_page(p));
162}
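/*
 * Worked example: VTD_PAGE_SHIFT is 12, so with 4KiB MM pages (x86) the
 * shift is 0 and both conversions are the identity. On a hypothetical
 * configuration with 16KiB MM pages the shift would be 2: mm_to_dma_pfn(5)
 * = 20, and dma_to_mm_pfn(23) = 5.
 */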
163
164/* global iommu list, set NULL for ignored DMAR units */
165static struct intel_iommu **g_iommus;
166
167static void __init check_tylersburg_isoch(void);
168static int rwbf_quirk;
169
170/*
171 * Set to 1 to panic the kernel if VT-d cannot be successfully enabled
172 * (used when the kernel is launched with TXT).
173 */
174static int force_on = 0;
175
176/*
177 * 0: Present
178 * 1-11: Reserved
179 * 12-63: Context Ptr (12 - (haw-1))
180 * 64-127: Reserved
181 */
182struct root_entry {
183    u64 val;
184    u64 rsvd1;
185};
186#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
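/*
 * Worked example: ROOT_ENTRY_NR = 4096 / sizeof(struct root_entry) =
 * 4096 / 16 = 256, i.e. one root entry per possible PCI bus number. Bit 0
 * of val is the present bit and bits 12-63 hold the 4KiB-aligned physical
 * address of that bus's context table.
 */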
187static inline bool root_present(struct root_entry *root)
188{
189    return (root->val & 1);
190}
191static inline void set_root_present(struct root_entry *root)
192{
193    root->val |= 1;
194}
195static inline void set_root_value(struct root_entry *root, unsigned long value)
196{
197    root->val |= value & VTD_PAGE_MASK;
198}
199
200static inline struct context_entry *
201get_context_addr_from_root(struct root_entry *root)
202{
203    return (struct context_entry *)
204        (root_present(root)?phys_to_virt(
205        root->val & VTD_PAGE_MASK) :
206        NULL);
207}
208
209/*
210 * low 64 bits:
211 * 0: present
212 * 1: fault processing disable
213 * 2-3: translation type
214 * 12-63: address space root
215 * high 64 bits:
216 * 0-2: address width
217 * 3-6: avail
218 * 8-23: domain id
219 */
220struct context_entry {
221    u64 lo;
222    u64 hi;
223};
224
225static inline bool context_present(struct context_entry *context)
226{
227    return (context->lo & 1);
228}
229static inline void context_set_present(struct context_entry *context)
230{
231    context->lo |= 1;
232}
233
234static inline void context_set_fault_enable(struct context_entry *context)
235{
236    context->lo &= (((u64)-1) << 2) | 1;
237}
238
239static inline void context_set_translation_type(struct context_entry *context,
240                        unsigned long value)
241{
242    context->lo &= (((u64)-1) << 4) | 3;
243    context->lo |= (value & 3) << 2;
244}
245
246static inline void context_set_address_root(struct context_entry *context,
247                        unsigned long value)
248{
249    context->lo |= value & VTD_PAGE_MASK;
250}
251
252static inline void context_set_address_width(struct context_entry *context,
253                         unsigned long value)
254{
255    context->hi |= value & 7;
256}
257
258static inline void context_set_domain_id(struct context_entry *context,
259                     unsigned long value)
260{
261    context->hi |= (value & ((1 << 16) - 1)) << 8;
262}
263
264static inline void context_clear_entry(struct context_entry *context)
265{
266    context->lo = 0;
267    context->hi = 0;
268}
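/*
 * Illustrative sketch, not part of the upstream driver: how the helpers above
 * combine to populate a multi-level context entry. The function name and
 * arguments are hypothetical; domain_context_mapping_one() below is the real
 * user of these helpers.
 */
#if 0 /* illustration only */
static void example_fill_context(struct context_entry *ce, unsigned long pgd_phys,
                                 int agaw, u16 domain_id)
{
    context_clear_entry(ce);                        /* zero both halves */
    context_set_domain_id(ce, domain_id);           /* hi bits 8-23 */
    context_set_address_width(ce, agaw);            /* hi bits 0-2 */
    context_set_address_root(ce, pgd_phys);         /* lo bits 12-63 */
    context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
    context_set_fault_enable(ce);                   /* clear lo bit 1 (fault processing disable) */
    context_set_present(ce);                        /* lo bit 0, set last */
}
#endif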
269
270/*
271 * 0: readable
272 * 1: writable
273 * 2-6: reserved
274 * 7: super page
275 * 8-10: available
276 * 11: snoop behavior
277 * 12-63: Host physical address
278 */
279struct dma_pte {
280    u64 val;
281};
282
283static inline void dma_clear_pte(struct dma_pte *pte)
284{
285    pte->val = 0;
286}
287
288static inline void dma_set_pte_readable(struct dma_pte *pte)
289{
290    pte->val |= DMA_PTE_READ;
291}
292
293static inline void dma_set_pte_writable(struct dma_pte *pte)
294{
295    pte->val |= DMA_PTE_WRITE;
296}
297
298static inline void dma_set_pte_snp(struct dma_pte *pte)
299{
300    pte->val |= DMA_PTE_SNP;
301}
302
303static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
304{
305    pte->val = (pte->val & ~3) | (prot & 3);
306}
307
308static inline u64 dma_pte_addr(struct dma_pte *pte)
309{
310#ifdef CONFIG_64BIT
311    return pte->val & VTD_PAGE_MASK;
312#else
313    /* Must have a full atomic 64-bit read */
314    return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
315#endif
316}
317
318static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
319{
320    pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
321}
322
323static inline bool dma_pte_present(struct dma_pte *pte)
324{
325    return (pte->val & 3) != 0;
326}
327
328static inline bool dma_pte_superpage(struct dma_pte *pte)
329{
330    return (pte->val & (1 << 7));
331}
332
333static inline int first_pte_in_page(struct dma_pte *pte)
334{
335    return !((unsigned long)pte & ~VTD_PAGE_MASK);
336}
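/*
 * Illustrative sketch, not part of the upstream driver: composing a leaf PTE
 * with the helpers above. __domain_mapping() below installs PTEs with
 * cmpxchg64 instead; this hypothetical helper only shows the bit layout.
 */
#if 0 /* illustration only */
static void example_fill_pte(struct dma_pte *pte, unsigned long phys_pfn,
                             bool snoop)
{
    dma_clear_pte(pte);
    dma_set_pte_pfn(pte, phys_pfn); /* bits 12-63: host physical address */
    dma_set_pte_readable(pte);      /* bit 0 */
    dma_set_pte_writable(pte);      /* bit 1 */
    if (snoop)
        dma_set_pte_snp(pte);       /* bit 11: force snooping */
}
#endif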
337
338/*
339 * This domain is a statically identity-mapped domain.
340 * 1. This domain creates a static 1:1 mapping to all usable memory.
341 * 2. It maps to each iommu if successful.
342 * 3. Each iommu maps to this domain if successful.
343 */
344static struct dmar_domain *si_domain;
345static int hw_pass_through = 1;
346
347/* devices under the same p2p bridge are owned in one domain */
348#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
349
350/* domain represents a virtual machine; more than one device
351 * across iommus may be owned by one domain, e.g. a kvm guest.
352 */
353#define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
354
355/* si_domain contains multiple devices */
356#define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2)
357
358/* define the limit of IOMMUs supported in each domain */
359#ifdef CONFIG_X86
360# define IOMMU_UNITS_SUPPORTED MAX_IO_APICS
361#else
362# define IOMMU_UNITS_SUPPORTED 64
363#endif
364
365struct dmar_domain {
366    int id; /* domain id */
367    int nid; /* node id */
368    DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
369                    /* bitmap of iommus this domain uses*/
370
371    struct list_head devices; /* all devices' list */
372    struct iova_domain iovad; /* iova's that belong to this domain */
373
374    struct dma_pte *pgd; /* virtual address */
375    int gaw; /* max guest address width */
376
377    /* adjusted guest address width, 0 is level 2 30-bit */
378    int agaw;
379
380    int flags; /* flags to find out type of domain */
381
382    int iommu_coherency;/* indicate coherency of iommu access */
383    int iommu_snooping; /* indicate snooping control feature*/
384    int iommu_count; /* reference count of iommu */
385    int iommu_superpage;/* Level of superpages supported:
386                       0 == 4KiB (no superpages), 1 == 2MiB,
387                       2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
388    spinlock_t iommu_lock; /* protect iommu set in domain */
389    u64 max_addr; /* maximum mapped address */
390};
391
392/* PCI domain-device relationship */
393struct device_domain_info {
394    struct list_head link; /* link to domain siblings */
395    struct list_head global; /* link to global list */
396    int segment; /* PCI domain */
397    u8 bus; /* PCI bus number */
398    u8 devfn; /* PCI devfn number */
399    struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
400    struct intel_iommu *iommu; /* IOMMU used by this device */
401    struct dmar_domain *domain; /* pointer to domain */
402};
403
404static void flush_unmaps_timeout(unsigned long data);
405
406DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
407
408#define HIGH_WATER_MARK 250
409struct deferred_flush_tables {
410    int next;
411    struct iova *iova[HIGH_WATER_MARK];
412    struct dmar_domain *domain[HIGH_WATER_MARK];
413};
414
415static struct deferred_flush_tables *deferred_flush;
416
417/* number of registered IOMMUs; used to size and index g_iommus */
418static int g_num_of_iommus;
419
420static DEFINE_SPINLOCK(async_umap_flush_lock);
421static LIST_HEAD(unmaps_to_do);
422
423static int timer_on;
424static long list_size;
425
426static void domain_remove_dev_info(struct dmar_domain *domain);
427
428#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
429int dmar_disabled = 0;
430#else
431int dmar_disabled = 1;
432#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
433
434int intel_iommu_enabled = 0;
435EXPORT_SYMBOL_GPL(intel_iommu_enabled);
436
437static int dmar_map_gfx = 1;
438static int dmar_forcedac;
439static int intel_iommu_strict;
440static int intel_iommu_superpage = 1;
441
442int intel_iommu_gfx_mapped;
443EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
444
445#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
446static DEFINE_SPINLOCK(device_domain_lock);
447static LIST_HEAD(device_domain_list);
448
449static struct iommu_ops intel_iommu_ops;
450
451static int __init intel_iommu_setup(char *str)
452{
453    if (!str)
454        return -EINVAL;
455    while (*str) {
456        if (!strncmp(str, "on", 2)) {
457            dmar_disabled = 0;
458            printk(KERN_INFO "Intel-IOMMU: enabled\n");
459        } else if (!strncmp(str, "off", 3)) {
460            dmar_disabled = 1;
461            printk(KERN_INFO "Intel-IOMMU: disabled\n");
462        } else if (!strncmp(str, "igfx_off", 8)) {
463            dmar_map_gfx = 0;
464            printk(KERN_INFO
465                "Intel-IOMMU: disable GFX device mapping\n");
466        } else if (!strncmp(str, "forcedac", 8)) {
467            printk(KERN_INFO
468                "Intel-IOMMU: Forcing DAC for PCI devices\n");
469            dmar_forcedac = 1;
470        } else if (!strncmp(str, "strict", 6)) {
471            printk(KERN_INFO
472                "Intel-IOMMU: disable batched IOTLB flush\n");
473            intel_iommu_strict = 1;
474        } else if (!strncmp(str, "sp_off", 6)) {
475            printk(KERN_INFO
476                "Intel-IOMMU: disable supported super page\n");
477            intel_iommu_superpage = 0;
478        }
479
480        str += strcspn(str, ",");
481        while (*str == ',')
482            str++;
483    }
484    return 0;
485}
486__setup("intel_iommu=", intel_iommu_setup);
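/*
 * Example usage: options may be combined on the kernel command line, e.g.
 * "intel_iommu=on,strict,sp_off" enables the IOMMU, disables batched IOTLB
 * flushing and disables superpage use. Unrecognized tokens are silently
 * skipped by the parsing loop above.
 */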
487
488static struct kmem_cache *iommu_domain_cache;
489static struct kmem_cache *iommu_devinfo_cache;
490static struct kmem_cache *iommu_iova_cache;
491
492static inline void *alloc_pgtable_page(int node)
493{
494    struct page *page;
495    void *vaddr = NULL;
496
497    page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
498    if (page)
499        vaddr = page_address(page);
500    return vaddr;
501}
502
503static inline void free_pgtable_page(void *vaddr)
504{
505    free_page((unsigned long)vaddr);
506}
507
508static inline void *alloc_domain_mem(void)
509{
510    return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
511}
512
513static void free_domain_mem(void *vaddr)
514{
515    kmem_cache_free(iommu_domain_cache, vaddr);
516}
517
518static inline void * alloc_devinfo_mem(void)
519{
520    return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
521}
522
523static inline void free_devinfo_mem(void *vaddr)
524{
525    kmem_cache_free(iommu_devinfo_cache, vaddr);
526}
527
528struct iova *alloc_iova_mem(void)
529{
530    return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
531}
532
533void free_iova_mem(struct iova *iova)
534{
535    kmem_cache_free(iommu_iova_cache, iova);
536}
537
538
539static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
540{
541    unsigned long sagaw;
542    int agaw = -1;
543
544    sagaw = cap_sagaw(iommu->cap);
545    for (agaw = width_to_agaw(max_gaw);
546         agaw >= 0; agaw--) {
547        if (test_bit(agaw, &sagaw))
548            break;
549    }
550
551    return agaw;
552}
553
554/*
555 * Calculate max SAGAW for each iommu.
556 */
557int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
558{
559    return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
560}
561
562/*
563 * calculate agaw for each iommu.
564 * "SAGAW" may be different across iommus: use a default agaw, and
565 * fall back to a smaller supported agaw for iommus that don't support the default.
566 */
567int iommu_calculate_agaw(struct intel_iommu *iommu)
568{
569    return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
570}
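/*
 * Worked example: if cap_sagaw() reports 0x4 (only bit 2 set, i.e. 4-level
 * tables), then for DEFAULT_DOMAIN_ADDRESS_WIDTH = 48 the search starts at
 * agaw = width_to_agaw(48) = 2, finds bit 2 set and returns 2. For
 * MAX_AGAW_WIDTH = 64 it starts at agaw = 3, finds that bit clear, and
 * falls back to 2 as well.
 */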
571
572/* This function only returns a single iommu in a domain */
573static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
574{
575    int iommu_id;
576
577    /* si_domain and vm domain should not get here. */
578    BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
579    BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
580
581    iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
582    if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
583        return NULL;
584
585    return g_iommus[iommu_id];
586}
587
588static void domain_update_iommu_coherency(struct dmar_domain *domain)
589{
590    int i;
591
592    domain->iommu_coherency = 1;
593
594    for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
595        if (!ecap_coherent(g_iommus[i]->ecap)) {
596            domain->iommu_coherency = 0;
597            break;
598        }
599    }
600}
601
602static void domain_update_iommu_snooping(struct dmar_domain *domain)
603{
604    int i;
605
606    domain->iommu_snooping = 1;
607
608    for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
609        if (!ecap_sc_support(g_iommus[i]->ecap)) {
610            domain->iommu_snooping = 0;
611            break;
612        }
613    }
614}
615
616static void domain_update_iommu_superpage(struct dmar_domain *domain)
617{
618    struct dmar_drhd_unit *drhd;
619    struct intel_iommu *iommu = NULL;
620    int mask = 0xf;
621
622    if (!intel_iommu_superpage) {
623        domain->iommu_superpage = 0;
624        return;
625    }
626
627    /* set iommu_superpage to the smallest common denominator */
628    for_each_active_iommu(iommu, drhd) {
629        mask &= cap_super_page_val(iommu->cap);
630        if (!mask) {
631            break;
632        }
633    }
634    domain->iommu_superpage = fls(mask);
635}
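/*
 * Worked example: cap_super_page_val() is a per-iommu bitmap where bit 0
 * means 2MiB and bit 1 means 1GiB support. If one active iommu reports 0x3
 * and another 0x1, mask ends up as 0x1 and fls(0x1) = 1, so the domain is
 * limited to 2MiB superpages (see the iommu_superpage encoding in
 * struct dmar_domain above).
 */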
636
637/* Some capabilities may be different across iommus */
638static void domain_update_iommu_cap(struct dmar_domain *domain)
639{
640    domain_update_iommu_coherency(domain);
641    domain_update_iommu_snooping(domain);
642    domain_update_iommu_superpage(domain);
643}
644
645static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
646{
647    struct dmar_drhd_unit *drhd = NULL;
648    int i;
649
650    for_each_drhd_unit(drhd) {
651        if (drhd->ignored)
652            continue;
653        if (segment != drhd->segment)
654            continue;
655
656        for (i = 0; i < drhd->devices_cnt; i++) {
657            if (drhd->devices[i] &&
658                drhd->devices[i]->bus->number == bus &&
659                drhd->devices[i]->devfn == devfn)
660                return drhd->iommu;
661            if (drhd->devices[i] &&
662                drhd->devices[i]->subordinate &&
663                drhd->devices[i]->subordinate->number <= bus &&
664                drhd->devices[i]->subordinate->busn_res.end >= bus)
665                return drhd->iommu;
666        }
667
668        if (drhd->include_all)
669            return drhd->iommu;
670    }
671
672    return NULL;
673}
674
675static void domain_flush_cache(struct dmar_domain *domain,
676                   void *addr, int size)
677{
678    if (!domain->iommu_coherency)
679        clflush_cache_range(addr, size);
680}
681
682/* Gets context entry for a given bus and devfn */
683static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
684        u8 bus, u8 devfn)
685{
686    struct root_entry *root;
687    struct context_entry *context;
688    unsigned long phy_addr;
689    unsigned long flags;
690
691    spin_lock_irqsave(&iommu->lock, flags);
692    root = &iommu->root_entry[bus];
693    context = get_context_addr_from_root(root);
694    if (!context) {
695        context = (struct context_entry *)
696                alloc_pgtable_page(iommu->node);
697        if (!context) {
698            spin_unlock_irqrestore(&iommu->lock, flags);
699            return NULL;
700        }
701        __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
702        phy_addr = virt_to_phys((void *)context);
703        set_root_value(root, phy_addr);
704        set_root_present(root);
705        __iommu_flush_cache(iommu, root, sizeof(*root));
706    }
707    spin_unlock_irqrestore(&iommu->lock, flags);
708    return &context[devfn];
709}
710
711static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
712{
713    struct root_entry *root;
714    struct context_entry *context;
715    int ret;
716    unsigned long flags;
717
718    spin_lock_irqsave(&iommu->lock, flags);
719    root = &iommu->root_entry[bus];
720    context = get_context_addr_from_root(root);
721    if (!context) {
722        ret = 0;
723        goto out;
724    }
725    ret = context_present(&context[devfn]);
726out:
727    spin_unlock_irqrestore(&iommu->lock, flags);
728    return ret;
729}
730
731static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
732{
733    struct root_entry *root;
734    struct context_entry *context;
735    unsigned long flags;
736
737    spin_lock_irqsave(&iommu->lock, flags);
738    root = &iommu->root_entry[bus];
739    context = get_context_addr_from_root(root);
740    if (context) {
741        context_clear_entry(&context[devfn]);
742        __iommu_flush_cache(iommu, &context[devfn], \
743            sizeof(*context));
744    }
745    spin_unlock_irqrestore(&iommu->lock, flags);
746}
747
748static void free_context_table(struct intel_iommu *iommu)
749{
750    struct root_entry *root;
751    int i;
752    unsigned long flags;
753    struct context_entry *context;
754
755    spin_lock_irqsave(&iommu->lock, flags);
756    if (!iommu->root_entry) {
757        goto out;
758    }
759    for (i = 0; i < ROOT_ENTRY_NR; i++) {
760        root = &iommu->root_entry[i];
761        context = get_context_addr_from_root(root);
762        if (context)
763            free_pgtable_page(context);
764    }
765    free_pgtable_page(iommu->root_entry);
766    iommu->root_entry = NULL;
767out:
768    spin_unlock_irqrestore(&iommu->lock, flags);
769}
770
771static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
772                      unsigned long pfn, int target_level)
773{
774    int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
775    struct dma_pte *parent, *pte = NULL;
776    int level = agaw_to_level(domain->agaw);
777    int offset;
778
779    BUG_ON(!domain->pgd);
780    BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
781    parent = domain->pgd;
782
783    while (level > 0) {
784        void *tmp_page;
785
786        offset = pfn_level_offset(pfn, level);
787        pte = &parent[offset];
788        if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
789            break;
790        if (level == target_level)
791            break;
792
793        if (!dma_pte_present(pte)) {
794            uint64_t pteval;
795
796            tmp_page = alloc_pgtable_page(domain->nid);
797
798            if (!tmp_page)
799                return NULL;
800
801            domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
802            pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
803            if (cmpxchg64(&pte->val, 0ULL, pteval)) {
804                /* Someone else set it while we were thinking; use theirs. */
805                free_pgtable_page(tmp_page);
806            } else {
807                dma_pte_addr(pte);
808                domain_flush_cache(domain, pte, sizeof(*pte));
809            }
810        }
811        parent = phys_to_virt(dma_pte_addr(pte));
812        level--;
813    }
814
815    return pte;
816}
817
818
819/* return address's pte at specific level */
820static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
821                     unsigned long pfn,
822                     int level, int *large_page)
823{
824    struct dma_pte *parent, *pte = NULL;
825    int total = agaw_to_level(domain->agaw);
826    int offset;
827
828    parent = domain->pgd;
829    while (level <= total) {
830        offset = pfn_level_offset(pfn, total);
831        pte = &parent[offset];
832        if (level == total)
833            return pte;
834
835        if (!dma_pte_present(pte)) {
836            *large_page = total;
837            break;
838        }
839
840        if (pte->val & DMA_PTE_LARGE_PAGE) {
841            *large_page = total;
842            return pte;
843        }
844
845        parent = phys_to_virt(dma_pte_addr(pte));
846        total--;
847    }
848    return NULL;
849}
850
851/* clear last level pte; a tlb flush should follow */
852static int dma_pte_clear_range(struct dmar_domain *domain,
853                unsigned long start_pfn,
854                unsigned long last_pfn)
855{
856    int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
857    unsigned int large_page = 1;
858    struct dma_pte *first_pte, *pte;
859    int order;
860
861    BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
862    BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
863    BUG_ON(start_pfn > last_pfn);
864
865    /* we don't need lock here; nobody else touches the iova range */
866    do {
867        large_page = 1;
868        first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
869        if (!pte) {
870            start_pfn = align_to_level(start_pfn + 1, large_page + 1);
871            continue;
872        }
873        do {
874            dma_clear_pte(pte);
875            start_pfn += lvl_to_nr_pages(large_page);
876            pte++;
877        } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
878
879        domain_flush_cache(domain, first_pte,
880                   (void *)pte - (void *)first_pte);
881
882    } while (start_pfn && start_pfn <= last_pfn);
883
884    order = (large_page - 1) * 9;
885    return order;
886}
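/*
 * Worked example: the returned order reflects the size of the last PTEs
 * cleared, for the caller's IOTLB flush. If the final iteration cleared
 * ordinary 4KiB PTEs, large_page is 1 and the result is (1 - 1) * 9 = 0;
 * if it cleared 2MiB superpage PTEs, large_page is 2 and the result is 9.
 */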
887
888/* free page table pages. last level pte should already be cleared */
889static void dma_pte_free_pagetable(struct dmar_domain *domain,
890                   unsigned long start_pfn,
891                   unsigned long last_pfn)
892{
893    int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
894    struct dma_pte *first_pte, *pte;
895    int total = agaw_to_level(domain->agaw);
896    int level;
897    unsigned long tmp;
898    int large_page = 2;
899
900    BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
901    BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
902    BUG_ON(start_pfn > last_pfn);
903
904    /* We don't need lock here; nobody else touches the iova range */
905    level = 2;
906    while (level <= total) {
907        tmp = align_to_level(start_pfn, level);
908
909        /* If we can't even clear one PTE at this level, we're done */
910        if (tmp + level_size(level) - 1 > last_pfn)
911            return;
912
913        do {
914            large_page = level;
915            first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
916            if (large_page > level)
917                level = large_page + 1;
918            if (!pte) {
919                tmp = align_to_level(tmp + 1, level + 1);
920                continue;
921            }
922            do {
923                if (dma_pte_present(pte)) {
924                    free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
925                    dma_clear_pte(pte);
926                }
927                pte++;
928                tmp += level_size(level);
929            } while (!first_pte_in_page(pte) &&
930                 tmp + level_size(level) - 1 <= last_pfn);
931
932            domain_flush_cache(domain, first_pte,
933                       (void *)pte - (void *)first_pte);
934            
935        } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
936        level++;
937    }
938    /* free pgd */
939    if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
940        free_pgtable_page(domain->pgd);
941        domain->pgd = NULL;
942    }
943}
944
945/* iommu handling */
946static int iommu_alloc_root_entry(struct intel_iommu *iommu)
947{
948    struct root_entry *root;
949    unsigned long flags;
950
951    root = (struct root_entry *)alloc_pgtable_page(iommu->node);
952    if (!root)
953        return -ENOMEM;
954
955    __iommu_flush_cache(iommu, root, ROOT_SIZE);
956
957    spin_lock_irqsave(&iommu->lock, flags);
958    iommu->root_entry = root;
959    spin_unlock_irqrestore(&iommu->lock, flags);
960
961    return 0;
962}
963
964static void iommu_set_root_entry(struct intel_iommu *iommu)
965{
966    void *addr;
967    u32 sts;
968    unsigned long flag;
969
970    addr = iommu->root_entry;
971
972    raw_spin_lock_irqsave(&iommu->register_lock, flag);
973    dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
974
975    writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
976
977    /* Make sure hardware completes it */
978    IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
979              readl, (sts & DMA_GSTS_RTPS), sts);
980
981    raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
982}
983
984static void iommu_flush_write_buffer(struct intel_iommu *iommu)
985{
986    u32 val;
987    unsigned long flag;
988
989    if (!rwbf_quirk && !cap_rwbf(iommu->cap))
990        return;
991
992    raw_spin_lock_irqsave(&iommu->register_lock, flag);
993    writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
994
995    /* Make sure hardware completes it */
996    IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
997              readl, (!(val & DMA_GSTS_WBFS)), val);
998
999    raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1000}
1001
1002/* the return value determines if we need a write buffer flush */
1003static void __iommu_flush_context(struct intel_iommu *iommu,
1004                  u16 did, u16 source_id, u8 function_mask,
1005                  u64 type)
1006{
1007    u64 val = 0;
1008    unsigned long flag;
1009
1010    switch (type) {
1011    case DMA_CCMD_GLOBAL_INVL:
1012        val = DMA_CCMD_GLOBAL_INVL;
1013        break;
1014    case DMA_CCMD_DOMAIN_INVL:
1015        val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1016        break;
1017    case DMA_CCMD_DEVICE_INVL:
1018        val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1019            | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1020        break;
1021    default:
1022        BUG();
1023    }
1024    val |= DMA_CCMD_ICC;
1025
1026    raw_spin_lock_irqsave(&iommu->register_lock, flag);
1027    dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1028
1029    /* Make sure hardware completes it */
1030    IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1031        dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1032
1033    raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1034}
1035
1036/* the return value determines if we need a write buffer flush */
1037static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1038                u64 addr, unsigned int size_order, u64 type)
1039{
1040    int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1041    u64 val = 0, val_iva = 0;
1042    unsigned long flag;
1043
1044    switch (type) {
1045    case DMA_TLB_GLOBAL_FLUSH:
1046        /* global flush doesn't need to set IVA_REG */
1047        val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1048        break;
1049    case DMA_TLB_DSI_FLUSH:
1050        val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1051        break;
1052    case DMA_TLB_PSI_FLUSH:
1053        val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1054        /* Note: always flush non-leaf currently */
1055        val_iva = size_order | addr;
1056        break;
1057    default:
1058        BUG();
1059    }
1060    /* Note: set drain read/write */
1061#if 0
1062    /*
1063     * This is probably just to be extra safe. Looks like we can
1064     * ignore it without any impact.
1065     */
1066    if (cap_read_drain(iommu->cap))
1067        val |= DMA_TLB_READ_DRAIN;
1068#endif
1069    if (cap_write_drain(iommu->cap))
1070        val |= DMA_TLB_WRITE_DRAIN;
1071
1072    raw_spin_lock_irqsave(&iommu->register_lock, flag);
1073    /* Note: Only uses first TLB reg currently */
1074    if (val_iva)
1075        dmar_writeq(iommu->reg + tlb_offset, val_iva);
1076    dmar_writeq(iommu->reg + tlb_offset + 8, val);
1077
1078    /* Make sure hardware completes it */
1079    IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1080        dmar_readq, (!(val & DMA_TLB_IVT)), val);
1081
1082    raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1083
1084    /* check IOTLB invalidation granularity */
1085    if (DMA_TLB_IAIG(val) == 0)
1086        printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
1087    if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1088        pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1089            (unsigned long long)DMA_TLB_IIRG(type),
1090            (unsigned long long)DMA_TLB_IAIG(val));
1091}
1092
1093static struct device_domain_info *iommu_support_dev_iotlb(
1094    struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1095{
1096    int found = 0;
1097    unsigned long flags;
1098    struct device_domain_info *info;
1099    struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1100
1101    if (!ecap_dev_iotlb_support(iommu->ecap))
1102        return NULL;
1103
1104    if (!iommu->qi)
1105        return NULL;
1106
1107    spin_lock_irqsave(&device_domain_lock, flags);
1108    list_for_each_entry(info, &domain->devices, link)
1109        if (info->bus == bus && info->devfn == devfn) {
1110            found = 1;
1111            break;
1112        }
1113    spin_unlock_irqrestore(&device_domain_lock, flags);
1114
1115    if (!found || !info->dev)
1116        return NULL;
1117
1118    if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1119        return NULL;
1120
1121    if (!dmar_find_matched_atsr_unit(info->dev))
1122        return NULL;
1123
1124    info->iommu = iommu;
1125
1126    return info;
1127}
1128
1129static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1130{
1131    if (!info)
1132        return;
1133
1134    pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1135}
1136
1137static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1138{
1139    if (!info->dev || !pci_ats_enabled(info->dev))
1140        return;
1141
1142    pci_disable_ats(info->dev);
1143}
1144
1145static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1146                  u64 addr, unsigned mask)
1147{
1148    u16 sid, qdep;
1149    unsigned long flags;
1150    struct device_domain_info *info;
1151
1152    spin_lock_irqsave(&device_domain_lock, flags);
1153    list_for_each_entry(info, &domain->devices, link) {
1154        if (!info->dev || !pci_ats_enabled(info->dev))
1155            continue;
1156
1157        sid = info->bus << 8 | info->devfn;
1158        qdep = pci_ats_queue_depth(info->dev);
1159        qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1160    }
1161    spin_unlock_irqrestore(&device_domain_lock, flags);
1162}
1163
1164static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1165                  unsigned long pfn, unsigned int pages, int map)
1166{
1167    unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1168    uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1169
1170    BUG_ON(pages == 0);
1171
1172    /*
1173     * Fall back to domain-selective flush if there is no PSI support or the
1174     * size is too big.
1175     * PSI requires the page size to be 2^x, and the base address to be
1176     * naturally aligned to that size.
1177     */
1178    if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1179        iommu->flush.flush_iotlb(iommu, did, 0, 0,
1180                        DMA_TLB_DSI_FLUSH);
1181    else
1182        iommu->flush.flush_iotlb(iommu, did, addr, mask,
1183                        DMA_TLB_PSI_FLUSH);
1184
1185    /*
1186     * In caching mode, changes of pages from non-present to present require
1187     * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1188     */
1189    if (!cap_caching_mode(iommu->cap) || !map)
1190        iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1191}
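/*
 * Worked example: for pages = 3, mask = ilog2(__roundup_pow_of_two(3)) =
 * ilog2(4) = 2, so the PSI flush covers 4 pages (16KiB) starting at addr,
 * which the hardware expects to be naturally aligned to that size. If mask
 * exceeded cap_max_amask_val(), the code above would instead issue a
 * domain-selective flush.
 */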
1192
1193static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1194{
1195    u32 pmen;
1196    unsigned long flags;
1197
1198    raw_spin_lock_irqsave(&iommu->register_lock, flags);
1199    pmen = readl(iommu->reg + DMAR_PMEN_REG);
1200    pmen &= ~DMA_PMEN_EPM;
1201    writel(pmen, iommu->reg + DMAR_PMEN_REG);
1202
1203    /* wait for the protected region status bit to clear */
1204    IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1205        readl, !(pmen & DMA_PMEN_PRS), pmen);
1206
1207    raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1208}
1209
1210static int iommu_enable_translation(struct intel_iommu *iommu)
1211{
1212    u32 sts;
1213    unsigned long flags;
1214
1215    raw_spin_lock_irqsave(&iommu->register_lock, flags);
1216    iommu->gcmd |= DMA_GCMD_TE;
1217    writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1218
1219    /* Make sure hardware completes it */
1220    IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1221              readl, (sts & DMA_GSTS_TES), sts);
1222
1223    raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1224    return 0;
1225}
1226
1227static int iommu_disable_translation(struct intel_iommu *iommu)
1228{
1229    u32 sts;
1230    unsigned long flag;
1231
1232    raw_spin_lock_irqsave(&iommu->register_lock, flag);
1233    iommu->gcmd &= ~DMA_GCMD_TE;
1234    writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1235
1236    /* Make sure hardware completes it */
1237    IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1238              readl, (!(sts & DMA_GSTS_TES)), sts);
1239
1240    raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1241    return 0;
1242}
1243
1244
1245static int iommu_init_domains(struct intel_iommu *iommu)
1246{
1247    unsigned long ndomains;
1248    unsigned long nlongs;
1249
1250    ndomains = cap_ndoms(iommu->cap);
1251    pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1252            ndomains);
1253    nlongs = BITS_TO_LONGS(ndomains);
1254
1255    spin_lock_init(&iommu->lock);
1256
1257    /* TBD: there might be 64K domains;
1258     * consider a different allocation scheme for future chips.
1259     */
1260    iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1261    if (!iommu->domain_ids) {
1262        printk(KERN_ERR "Allocating domain id array failed\n");
1263        return -ENOMEM;
1264    }
1265    iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1266            GFP_KERNEL);
1267    if (!iommu->domains) {
1268        printk(KERN_ERR "Allocating domain array failed\n");
1269        return -ENOMEM;
1270    }
1271
1272    /*
1273     * If Caching Mode is set, then invalid translations are tagged
1274     * with domain id 0, hence we need to pre-allocate it.
1275     */
1276    if (cap_caching_mode(iommu->cap))
1277        set_bit(0, iommu->domain_ids);
1278    return 0;
1279}
1280
1281
1282static void domain_exit(struct dmar_domain *domain);
1283static void vm_domain_exit(struct dmar_domain *domain);
1284
1285void free_dmar_iommu(struct intel_iommu *iommu)
1286{
1287    struct dmar_domain *domain;
1288    int i;
1289    unsigned long flags;
1290
1291    if ((iommu->domains) && (iommu->domain_ids)) {
1292        for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1293            domain = iommu->domains[i];
1294            clear_bit(i, iommu->domain_ids);
1295
1296            spin_lock_irqsave(&domain->iommu_lock, flags);
1297            if (--domain->iommu_count == 0) {
1298                if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1299                    vm_domain_exit(domain);
1300                else
1301                    domain_exit(domain);
1302            }
1303            spin_unlock_irqrestore(&domain->iommu_lock, flags);
1304        }
1305    }
1306
1307    if (iommu->gcmd & DMA_GCMD_TE)
1308        iommu_disable_translation(iommu);
1309
1310    if (iommu->irq) {
1311        irq_set_handler_data(iommu->irq, NULL);
1312        /* This will mask the irq */
1313        free_irq(iommu->irq, iommu);
1314        destroy_irq(iommu->irq);
1315    }
1316
1317    kfree(iommu->domains);
1318    kfree(iommu->domain_ids);
1319
1320    g_iommus[iommu->seq_id] = NULL;
1321
1322    /* if all iommus are freed, free g_iommus */
1323    for (i = 0; i < g_num_of_iommus; i++) {
1324        if (g_iommus[i])
1325            break;
1326    }
1327
1328    if (i == g_num_of_iommus)
1329        kfree(g_iommus);
1330
1331    /* free context mapping */
1332    free_context_table(iommu);
1333}
1334
1335static struct dmar_domain *alloc_domain(void)
1336{
1337    struct dmar_domain *domain;
1338
1339    domain = alloc_domain_mem();
1340    if (!domain)
1341        return NULL;
1342
1343    domain->nid = -1;
1344    memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1345    domain->flags = 0;
1346
1347    return domain;
1348}
1349
1350static int iommu_attach_domain(struct dmar_domain *domain,
1351                   struct intel_iommu *iommu)
1352{
1353    int num;
1354    unsigned long ndomains;
1355    unsigned long flags;
1356
1357    ndomains = cap_ndoms(iommu->cap);
1358
1359    spin_lock_irqsave(&iommu->lock, flags);
1360
1361    num = find_first_zero_bit(iommu->domain_ids, ndomains);
1362    if (num >= ndomains) {
1363        spin_unlock_irqrestore(&iommu->lock, flags);
1364        printk(KERN_ERR "IOMMU: no free domain ids\n");
1365        return -ENOMEM;
1366    }
1367
1368    domain->id = num;
1369    set_bit(num, iommu->domain_ids);
1370    set_bit(iommu->seq_id, domain->iommu_bmp);
1371    iommu->domains[num] = domain;
1372    spin_unlock_irqrestore(&iommu->lock, flags);
1373
1374    return 0;
1375}
1376
1377static void iommu_detach_domain(struct dmar_domain *domain,
1378                struct intel_iommu *iommu)
1379{
1380    unsigned long flags;
1381    int num, ndomains;
1382    int found = 0;
1383
1384    spin_lock_irqsave(&iommu->lock, flags);
1385    ndomains = cap_ndoms(iommu->cap);
1386    for_each_set_bit(num, iommu->domain_ids, ndomains) {
1387        if (iommu->domains[num] == domain) {
1388            found = 1;
1389            break;
1390        }
1391    }
1392
1393    if (found) {
1394        clear_bit(num, iommu->domain_ids);
1395        clear_bit(iommu->seq_id, domain->iommu_bmp);
1396        iommu->domains[num] = NULL;
1397    }
1398    spin_unlock_irqrestore(&iommu->lock, flags);
1399}
1400
1401static struct iova_domain reserved_iova_list;
1402static struct lock_class_key reserved_rbtree_key;
1403
1404static int dmar_init_reserved_ranges(void)
1405{
1406    struct pci_dev *pdev = NULL;
1407    struct iova *iova;
1408    int i;
1409
1410    init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1411
1412    lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1413        &reserved_rbtree_key);
1414
1415    /* IOAPIC ranges shouldn't be accessed by DMA */
1416    iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1417        IOVA_PFN(IOAPIC_RANGE_END));
1418    if (!iova) {
1419        printk(KERN_ERR "Reserve IOAPIC range failed\n");
1420        return -ENODEV;
1421    }
1422
1423    /* Reserve all PCI MMIO to avoid peer-to-peer access */
1424    for_each_pci_dev(pdev) {
1425        struct resource *r;
1426
1427        for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1428            r = &pdev->resource[i];
1429            if (!r->flags || !(r->flags & IORESOURCE_MEM))
1430                continue;
1431            iova = reserve_iova(&reserved_iova_list,
1432                        IOVA_PFN(r->start),
1433                        IOVA_PFN(r->end));
1434            if (!iova) {
1435                printk(KERN_ERR "Reserve iova failed\n");
1436                return -ENODEV;
1437            }
1438        }
1439    }
1440    return 0;
1441}
1442
1443static void domain_reserve_special_ranges(struct dmar_domain *domain)
1444{
1445    copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1446}
1447
1448static inline int guestwidth_to_adjustwidth(int gaw)
1449{
1450    int agaw;
1451    int r = (gaw - 12) % 9;
1452
1453    if (r == 0)
1454        agaw = gaw;
1455    else
1456        agaw = gaw + 9 - r;
1457    if (agaw > 64)
1458        agaw = 64;
1459    return agaw;
1460}
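/*
 * Worked example: the adjusted width is rounded up to the next value of the
 * form 12 + 9 * n so that it maps onto whole page-table levels.
 * guestwidth_to_adjustwidth(48) = 48 (r = 0), while for gaw = 36,
 * r = (36 - 12) % 9 = 6 and the result is 36 + 9 - 6 = 39, i.e. a 3-level
 * table width.
 */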
1461
1462static int domain_init(struct dmar_domain *domain, int guest_width)
1463{
1464    struct intel_iommu *iommu;
1465    int adjust_width, agaw;
1466    unsigned long sagaw;
1467
1468    init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1469    spin_lock_init(&domain->iommu_lock);
1470
1471    domain_reserve_special_ranges(domain);
1472
1473    /* calculate AGAW */
1474    iommu = domain_get_iommu(domain);
1475    if (guest_width > cap_mgaw(iommu->cap))
1476        guest_width = cap_mgaw(iommu->cap);
1477    domain->gaw = guest_width;
1478    adjust_width = guestwidth_to_adjustwidth(guest_width);
1479    agaw = width_to_agaw(adjust_width);
1480    sagaw = cap_sagaw(iommu->cap);
1481    if (!test_bit(agaw, &sagaw)) {
1482        /* hardware doesn't support it, choose a bigger one */
1483        pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1484        agaw = find_next_bit(&sagaw, 5, agaw);
1485        if (agaw >= 5)
1486            return -ENODEV;
1487    }
1488    domain->agaw = agaw;
1489    INIT_LIST_HEAD(&domain->devices);
1490
1491    if (ecap_coherent(iommu->ecap))
1492        domain->iommu_coherency = 1;
1493    else
1494        domain->iommu_coherency = 0;
1495
1496    if (ecap_sc_support(iommu->ecap))
1497        domain->iommu_snooping = 1;
1498    else
1499        domain->iommu_snooping = 0;
1500
1501    domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1502    domain->iommu_count = 1;
1503    domain->nid = iommu->node;
1504
1505    /* always allocate the top pgd */
1506    domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1507    if (!domain->pgd)
1508        return -ENOMEM;
1509    __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1510    return 0;
1511}
1512
1513static void domain_exit(struct dmar_domain *domain)
1514{
1515    struct dmar_drhd_unit *drhd;
1516    struct intel_iommu *iommu;
1517
1518    /* Domain 0 is reserved, so don't process it */
1519    if (!domain)
1520        return;
1521
1522    /* Flush any lazy unmaps that may reference this domain */
1523    if (!intel_iommu_strict)
1524        flush_unmaps_timeout(0);
1525
1526    domain_remove_dev_info(domain);
1527    /* destroy iovas */
1528    put_iova_domain(&domain->iovad);
1529
1530    /* clear ptes */
1531    dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1532
1533    /* free page tables */
1534    dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1535
1536    for_each_active_iommu(iommu, drhd)
1537        if (test_bit(iommu->seq_id, domain->iommu_bmp))
1538            iommu_detach_domain(domain, iommu);
1539
1540    free_domain_mem(domain);
1541}
1542
1543static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1544                 u8 bus, u8 devfn, int translation)
1545{
1546    struct context_entry *context;
1547    unsigned long flags;
1548    struct intel_iommu *iommu;
1549    struct dma_pte *pgd;
1550    unsigned long num;
1551    unsigned long ndomains;
1552    int id;
1553    int agaw;
1554    struct device_domain_info *info = NULL;
1555
1556    pr_debug("Set context mapping for %02x:%02x.%d\n",
1557        bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1558
1559    BUG_ON(!domain->pgd);
1560    BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1561           translation != CONTEXT_TT_MULTI_LEVEL);
1562
1563    iommu = device_to_iommu(segment, bus, devfn);
1564    if (!iommu)
1565        return -ENODEV;
1566
1567    context = device_to_context_entry(iommu, bus, devfn);
1568    if (!context)
1569        return -ENOMEM;
1570    spin_lock_irqsave(&iommu->lock, flags);
1571    if (context_present(context)) {
1572        spin_unlock_irqrestore(&iommu->lock, flags);
1573        return 0;
1574    }
1575
1576    id = domain->id;
1577    pgd = domain->pgd;
1578
1579    if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1580        domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1581        int found = 0;
1582
1583        /* find an available domain id for this device in iommu */
1584        ndomains = cap_ndoms(iommu->cap);
1585        for_each_set_bit(num, iommu->domain_ids, ndomains) {
1586            if (iommu->domains[num] == domain) {
1587                id = num;
1588                found = 1;
1589                break;
1590            }
1591        }
1592
1593        if (found == 0) {
1594            num = find_first_zero_bit(iommu->domain_ids, ndomains);
1595            if (num >= ndomains) {
1596                spin_unlock_irqrestore(&iommu->lock, flags);
1597                printk(KERN_ERR "IOMMU: no free domain ids\n");
1598                return -EFAULT;
1599            }
1600
1601            set_bit(num, iommu->domain_ids);
1602            iommu->domains[num] = domain;
1603            id = num;
1604        }
1605
1606        /* Skip top levels of page tables for
1607         * an iommu which has a smaller agaw than the default.
1608         * Unnecessary for PT mode.
1609         */
1610        if (translation != CONTEXT_TT_PASS_THROUGH) {
1611            for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1612                pgd = phys_to_virt(dma_pte_addr(pgd));
1613                if (!dma_pte_present(pgd)) {
1614                    spin_unlock_irqrestore(&iommu->lock, flags);
1615                    return -ENOMEM;
1616                }
1617            }
1618        }
1619    }
1620
1621    context_set_domain_id(context, id);
1622
1623    if (translation != CONTEXT_TT_PASS_THROUGH) {
1624        info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1625        translation = info ? CONTEXT_TT_DEV_IOTLB :
1626                     CONTEXT_TT_MULTI_LEVEL;
1627    }
1628    /*
1629     * In pass through mode, AW must be programmed to indicate the largest
1630     * AGAW value supported by hardware. And ASR is ignored by hardware.
1631     */
1632    if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1633        context_set_address_width(context, iommu->msagaw);
1634    else {
1635        context_set_address_root(context, virt_to_phys(pgd));
1636        context_set_address_width(context, iommu->agaw);
1637    }
1638
1639    context_set_translation_type(context, translation);
1640    context_set_fault_enable(context);
1641    context_set_present(context);
1642    domain_flush_cache(domain, context, sizeof(*context));
1643
1644    /*
1645     * It's a non-present to present mapping. If hardware doesn't cache
1646     * non-present entries, we only need to flush the write-buffer. If it
1647     * _does_ cache non-present entries, then it does so in the special
1648     * domain #0, which we have to flush:
1649     */
1650    if (cap_caching_mode(iommu->cap)) {
1651        iommu->flush.flush_context(iommu, 0,
1652                       (((u16)bus) << 8) | devfn,
1653                       DMA_CCMD_MASK_NOBIT,
1654                       DMA_CCMD_DEVICE_INVL);
1655        iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1656    } else {
1657        iommu_flush_write_buffer(iommu);
1658    }
1659    iommu_enable_dev_iotlb(info);
1660    spin_unlock_irqrestore(&iommu->lock, flags);
1661
1662    spin_lock_irqsave(&domain->iommu_lock, flags);
1663    if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1664        domain->iommu_count++;
1665        if (domain->iommu_count == 1)
1666            domain->nid = iommu->node;
1667        domain_update_iommu_cap(domain);
1668    }
1669    spin_unlock_irqrestore(&domain->iommu_lock, flags);
1670    return 0;
1671}
1672
1673static int
1674domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1675            int translation)
1676{
1677    int ret;
1678    struct pci_dev *tmp, *parent;
1679
1680    ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1681                     pdev->bus->number, pdev->devfn,
1682                     translation);
1683    if (ret)
1684        return ret;
1685
1686    /* dependent device mapping */
1687    tmp = pci_find_upstream_pcie_bridge(pdev);
1688    if (!tmp)
1689        return 0;
1690    /* Secondary interface's bus number and devfn 0 */
1691    parent = pdev->bus->self;
1692    while (parent != tmp) {
1693        ret = domain_context_mapping_one(domain,
1694                         pci_domain_nr(parent->bus),
1695                         parent->bus->number,
1696                         parent->devfn, translation);
1697        if (ret)
1698            return ret;
1699        parent = parent->bus->self;
1700    }
1701    if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1702        return domain_context_mapping_one(domain,
1703                    pci_domain_nr(tmp->subordinate),
1704                    tmp->subordinate->number, 0,
1705                    translation);
1706    else /* this is a legacy PCI bridge */
1707        return domain_context_mapping_one(domain,
1708                          pci_domain_nr(tmp->bus),
1709                          tmp->bus->number,
1710                          tmp->devfn,
1711                          translation);
1712}
1713
1714static int domain_context_mapped(struct pci_dev *pdev)
1715{
1716    int ret;
1717    struct pci_dev *tmp, *parent;
1718    struct intel_iommu *iommu;
1719
1720    iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1721                pdev->devfn);
1722    if (!iommu)
1723        return -ENODEV;
1724
1725    ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1726    if (!ret)
1727        return ret;
1728    /* dependent device mapping */
1729    tmp = pci_find_upstream_pcie_bridge(pdev);
1730    if (!tmp)
1731        return ret;
1732    /* Secondary interface's bus number and devfn 0 */
1733    parent = pdev->bus->self;
1734    while (parent != tmp) {
1735        ret = device_context_mapped(iommu, parent->bus->number,
1736                        parent->devfn);
1737        if (!ret)
1738            return ret;
1739        parent = parent->bus->self;
1740    }
1741    if (pci_is_pcie(tmp))
1742        return device_context_mapped(iommu, tmp->subordinate->number,
1743                         0);
1744    else
1745        return device_context_mapped(iommu, tmp->bus->number,
1746                         tmp->devfn);
1747}
1748
1749/* Returns the number of VT-d pages, but aligned to the MM page size */
1750static inline unsigned long aligned_nrpages(unsigned long host_addr,
1751                        size_t size)
1752{
1753    host_addr &= ~PAGE_MASK;
1754    return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1755}
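/*
 * Worked example: with 4KiB MM pages, aligned_nrpages(0x1800, 0x1000) keeps
 * only the sub-page offset 0x800, rounds 0x800 + 0x1000 up to 0x2000 and
 * returns 0x2000 >> 12 = 2 VT-d pages, even though the raw size alone would
 * fit in one page.
 */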
1756
1757/* Return largest possible superpage level for a given mapping */
1758static inline int hardware_largepage_caps(struct dmar_domain *domain,
1759                      unsigned long iov_pfn,
1760                      unsigned long phy_pfn,
1761                      unsigned long pages)
1762{
1763    int support, level = 1;
1764    unsigned long pfnmerge;
1765
1766    support = domain->iommu_superpage;
1767
1768    /* To use a large page, the virtual *and* physical addresses
1769       must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1770       of them will mean we have to use smaller pages. So just
1771       merge them and check both at once. */
1772    pfnmerge = iov_pfn | phy_pfn;
1773
1774    while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1775        pages >>= VTD_STRIDE_SHIFT;
1776        if (!pages)
1777            break;
1778        pfnmerge >>= VTD_STRIDE_SHIFT;
1779        level++;
1780        support--;
1781    }
1782    return level;
1783}
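/*
 * Worked example: VTD_STRIDE_SHIFT is 9. If iov_pfn and phy_pfn are both
 * multiples of 512 (2MiB-aligned), pages >= 512 and domain->iommu_superpage
 * >= 1, the first iteration succeeds and level becomes 2, so the caller can
 * install a single 2MiB PTE. A misaligned low bit in either pfn, or a run
 * shorter than 512 pages, stops the walk at level 1.
 */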
1784
1785static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1786                struct scatterlist *sg, unsigned long phys_pfn,
1787                unsigned long nr_pages, int prot)
1788{
1789    struct dma_pte *first_pte = NULL, *pte = NULL;
1790    phys_addr_t uninitialized_var(pteval);
1791    int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1792    unsigned long sg_res;
1793    unsigned int largepage_lvl = 0;
1794    unsigned long lvl_pages = 0;
1795
1796    BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1797
1798    if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1799        return -EINVAL;
1800
1801    prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1802
1803    if (sg)
1804        sg_res = 0;
1805    else {
1806        sg_res = nr_pages + 1;
1807        pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1808    }
1809
1810    while (nr_pages > 0) {
1811        uint64_t tmp;
1812
1813        if (!sg_res) {
1814            sg_res = aligned_nrpages(sg->offset, sg->length);
1815            sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1816            sg->dma_length = sg->length;
1817            pteval = page_to_phys(sg_page(sg)) | prot;
1818            phys_pfn = pteval >> VTD_PAGE_SHIFT;
1819        }
1820
1821        if (!pte) {
1822            largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1823
1824            first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1825            if (!pte)
1826                return -ENOMEM;
1827            /* It is a large page */
1828            if (largepage_lvl > 1)
1829                pteval |= DMA_PTE_LARGE_PAGE;
1830            else
1831                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1832
1833        }
1834        /* We don't need a lock here; nobody else
1835         * touches this iova range.
1836         */
1837        tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1838        if (tmp) {
1839            static int dumps = 5;
1840            printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1841                   iov_pfn, tmp, (unsigned long long)pteval);
1842            if (dumps) {
1843                dumps--;
1844                debug_dma_dump_mappings(NULL);
1845            }
1846            WARN_ON(1);
1847        }
1848
1849        lvl_pages = lvl_to_nr_pages(largepage_lvl);
1850
1851        BUG_ON(nr_pages < lvl_pages);
1852        BUG_ON(sg_res < lvl_pages);
1853
1854        nr_pages -= lvl_pages;
1855        iov_pfn += lvl_pages;
1856        phys_pfn += lvl_pages;
1857        pteval += lvl_pages * VTD_PAGE_SIZE;
1858        sg_res -= lvl_pages;
1859
1860        /* If the next PTE would be the first in a new page, then we
1861           need to flush the cache on the entries we've just written.
1862           And then we'll need to recalculate 'pte', so clear it and
1863           let it get set again in the if (!pte) block above.
1864
1865           If we're done (!nr_pages) we need to flush the cache too.
1866
1867           Also if we've been setting superpages, we may need to
1868           recalculate 'pte' and switch back to smaller pages for the
1869           end of the mapping, if the trailing size is not enough to
1870           use another superpage (i.e. sg_res < lvl_pages). */
1871        pte++;
1872        if (!nr_pages || first_pte_in_page(pte) ||
1873            (largepage_lvl > 1 && sg_res < lvl_pages)) {
1874            domain_flush_cache(domain, first_pte,
1875                       (void *)pte - (void *)first_pte);
1876            pte = NULL;
1877        }
1878
1879        if (!sg_res && nr_pages)
1880            sg = sg_next(sg);
1881    }
1882    return 0;
1883}
1884
1885static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1886                    struct scatterlist *sg, unsigned long nr_pages,
1887                    int prot)
1888{
1889    return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1890}
1891
1892static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1893                     unsigned long phys_pfn, unsigned long nr_pages,
1894                     int prot)
1895{
1896    return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1897}
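
/*
 * The two wrappers above are thin front ends for __domain_mapping(): pass
 * a scatterlist and the physical addresses come from the sg entries, or
 * pass NULL and a starting phys_pfn for one physically contiguous range.
 * A usage sketch with made-up numbers:
 *
 *   domain_pfn_mapping(domain, 0x1000, 0x8000, 16,
 *                      DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * maps 16 contiguous 4KiB pages starting at physical pfn 0x8000 to IOVA
 * pfn 0x1000 for read/write.
 */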
1898
1899static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1900{
1901    if (!iommu)
1902        return;
1903
1904    clear_context_table(iommu, bus, devfn);
1905    iommu->flush.flush_context(iommu, 0, 0, 0,
1906                       DMA_CCMD_GLOBAL_INVL);
1907    iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1908}
1909
1910static inline void unlink_domain_info(struct device_domain_info *info)
1911{
1912    assert_spin_locked(&device_domain_lock);
1913    list_del(&info->link);
1914    list_del(&info->global);
1915    if (info->dev)
1916        info->dev->dev.archdata.iommu = NULL;
1917}
1918
1919static void domain_remove_dev_info(struct dmar_domain *domain)
1920{
1921    struct device_domain_info *info;
1922    unsigned long flags;
1923    struct intel_iommu *iommu;
1924
1925    spin_lock_irqsave(&device_domain_lock, flags);
1926    while (!list_empty(&domain->devices)) {
1927        info = list_entry(domain->devices.next,
1928            struct device_domain_info, link);
1929        unlink_domain_info(info);
1930        spin_unlock_irqrestore(&device_domain_lock, flags);
1931
1932        iommu_disable_dev_iotlb(info);
1933        iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1934        iommu_detach_dev(iommu, info->bus, info->devfn);
1935        free_devinfo_mem(info);
1936
1937        spin_lock_irqsave(&device_domain_lock, flags);
1938    }
1939    spin_unlock_irqrestore(&device_domain_lock, flags);
1940}
1941
1942/*
1943 * find_domain
1944 * Note: struct pci_dev->dev.archdata.iommu stores the device_domain_info
1945 */
1946static struct dmar_domain *
1947find_domain(struct pci_dev *pdev)
1948{
1949    struct device_domain_info *info;
1950
1951    /* No lock here, assumes no domain exit in normal case */
1952    info = pdev->dev.archdata.iommu;
1953    if (info)
1954        return info->domain;
1955    return NULL;
1956}
1957
1958/* domain is initialized */
1959static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1960{
1961    struct dmar_domain *domain, *found = NULL;
1962    struct intel_iommu *iommu;
1963    struct dmar_drhd_unit *drhd;
1964    struct device_domain_info *info, *tmp;
1965    struct pci_dev *dev_tmp;
1966    unsigned long flags;
1967    int bus = 0, devfn = 0;
1968    int segment;
1969    int ret;
1970
1971    domain = find_domain(pdev);
1972    if (domain)
1973        return domain;
1974
1975    segment = pci_domain_nr(pdev->bus);
1976
1977    dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1978    if (dev_tmp) {
1979        if (pci_is_pcie(dev_tmp)) {
1980            bus = dev_tmp->subordinate->number;
1981            devfn = 0;
1982        } else {
1983            bus = dev_tmp->bus->number;
1984            devfn = dev_tmp->devfn;
1985        }
1986        spin_lock_irqsave(&device_domain_lock, flags);
1987        list_for_each_entry(info, &device_domain_list, global) {
1988            if (info->segment == segment &&
1989                info->bus == bus && info->devfn == devfn) {
1990                found = info->domain;
1991                break;
1992            }
1993        }
1994        spin_unlock_irqrestore(&device_domain_lock, flags);
1995        /* pcie-pci bridge already has a domain, use it */
1996        if (found) {
1997            domain = found;
1998            goto found_domain;
1999        }
2000    }
2001
2002    domain = alloc_domain();
2003    if (!domain)
2004        goto error;
2005
2006    /* Allocate new domain for the device */
2007    drhd = dmar_find_matched_drhd_unit(pdev);
2008    if (!drhd) {
2009        printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2010            pci_name(pdev));
2011        free_domain_mem(domain);
2012        return NULL;
2013    }
2014    iommu = drhd->iommu;
2015
2016    ret = iommu_attach_domain(domain, iommu);
2017    if (ret) {
2018        free_domain_mem(domain);
2019        goto error;
2020    }
2021
2022    if (domain_init(domain, gaw)) {
2023        domain_exit(domain);
2024        goto error;
2025    }
2026
2027    /* register pcie-to-pci device */
2028    if (dev_tmp) {
2029        info = alloc_devinfo_mem();
2030        if (!info) {
2031            domain_exit(domain);
2032            goto error;
2033        }
2034        info->segment = segment;
2035        info->bus = bus;
2036        info->devfn = devfn;
2037        info->dev = NULL;
2038        info->domain = domain;
2039        /* This domain is shared by devices under p2p bridge */
2040        domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2041
2042        /* pcie-to-pci bridge already has a domain, use it */
2043        found = NULL;
2044        spin_lock_irqsave(&device_domain_lock, flags);
2045        list_for_each_entry(tmp, &device_domain_list, global) {
2046            if (tmp->segment == segment &&
2047                tmp->bus == bus && tmp->devfn == devfn) {
2048                found = tmp->domain;
2049                break;
2050            }
2051        }
2052        if (found) {
2053            spin_unlock_irqrestore(&device_domain_lock, flags);
2054            free_devinfo_mem(info);
2055            domain_exit(domain);
2056            domain = found;
2057        } else {
2058            list_add(&info->link, &domain->devices);
2059            list_add(&info->global, &device_domain_list);
2060            spin_unlock_irqrestore(&device_domain_lock, flags);
2061        }
2062    }
2063
2064found_domain:
2065    info = alloc_devinfo_mem();
2066    if (!info)
2067        goto error;
2068    info->segment = segment;
2069    info->bus = pdev->bus->number;
2070    info->devfn = pdev->devfn;
2071    info->dev = pdev;
2072    info->domain = domain;
2073    spin_lock_irqsave(&device_domain_lock, flags);
2074    /* somebody else may have beaten us to it */
2075    found = find_domain(pdev);
2076    if (found != NULL) {
2077        spin_unlock_irqrestore(&device_domain_lock, flags);
2078        if (found != domain) {
2079            domain_exit(domain);
2080            domain = found;
2081        }
2082        free_devinfo_mem(info);
2083        return domain;
2084    }
2085    list_add(&info->link, &domain->devices);
2086    list_add(&info->global, &device_domain_list);
2087    pdev->dev.archdata.iommu = info;
2088    spin_unlock_irqrestore(&device_domain_lock, flags);
2089    return domain;
2090error:
2091    /* recheck it here, maybe others set it */
2092    return find_domain(pdev);
2093}
2094
2095static int iommu_identity_mapping;
2096#define IDENTMAP_ALL 1
2097#define IDENTMAP_GFX 2
2098#define IDENTMAP_AZALIA 4
2099
2100static int iommu_domain_identity_map(struct dmar_domain *domain,
2101                     unsigned long long start,
2102                     unsigned long long end)
2103{
2104    unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2105    unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2106
2107    if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2108              dma_to_mm_pfn(last_vpfn))) {
2109        printk(KERN_ERR "IOMMU: reserve iova failed\n");
2110        return -ENOMEM;
2111    }
2112
2113    pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2114         start, end, domain->id);
2115    /*
2116     * The RMRR range might overlap with a physical memory range;
2117     * clear it first
2118     */
2119    dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2120
2121    return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2122                  last_vpfn - first_vpfn + 1,
2123                  DMA_PTE_READ|DMA_PTE_WRITE);
2124}
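
/*
 * For an identity map the IOVA page frame equals the physical page frame,
 * which is why first_vpfn is passed twice above. A worked example with
 * VTD_PAGE_SHIFT == 12, using the 0-16MiB ISA range mapped further down:
 *
 *   start = 0x0, end = 0xffffff
 *   first_vpfn = 0x0, last_vpfn = 0xfff
 *   domain_pfn_mapping(domain, 0x0, 0x0, 0x1000, DMA_PTE_READ|DMA_PTE_WRITE)
 *
 * i.e. 4096 VT-d pages mapped 1:1.
 */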
2125
2126static int iommu_prepare_identity_map(struct pci_dev *pdev,
2127                      unsigned long long start,
2128                      unsigned long long end)
2129{
2130    struct dmar_domain *domain;
2131    int ret;
2132
2133    domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2134    if (!domain)
2135        return -ENOMEM;
2136
2137    /* For _hardware_ passthrough, don't bother. But for software
2138       passthrough, we do it anyway -- it may indicate a memory
2139       range which is reserved in E820 and so didn't get set
2140       up in si_domain to start with */
2141    if (domain == si_domain && hw_pass_through) {
2142        printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2143               pci_name(pdev), start, end);
2144        return 0;
2145    }
2146
2147    printk(KERN_INFO
2148           "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2149           pci_name(pdev), start, end);
2150
2151    if (end < start) {
2152        WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2153            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2154            dmi_get_system_info(DMI_BIOS_VENDOR),
2155            dmi_get_system_info(DMI_BIOS_VERSION),
2156             dmi_get_system_info(DMI_PRODUCT_VERSION));
2157        ret = -EIO;
2158        goto error;
2159    }
2160
2161    if (end >> agaw_to_width(domain->agaw)) {
2162        WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2163             "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2164             agaw_to_width(domain->agaw),
2165             dmi_get_system_info(DMI_BIOS_VENDOR),
2166             dmi_get_system_info(DMI_BIOS_VERSION),
2167             dmi_get_system_info(DMI_PRODUCT_VERSION));
2168        ret = -EIO;
2169        goto error;
2170    }
2171
2172    ret = iommu_domain_identity_map(domain, start, end);
2173    if (ret)
2174        goto error;
2175
2176    /* context entry init */
2177    ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2178    if (ret)
2179        goto error;
2180
2181    return 0;
2182
2183 error:
2184    domain_exit(domain);
2185    return ret;
2186}
2187
2188static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2189    struct pci_dev *pdev)
2190{
2191    if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2192        return 0;
2193    return iommu_prepare_identity_map(pdev, rmrr->base_address,
2194        rmrr->end_address);
2195}
2196
2197#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2198static inline void iommu_prepare_isa(void)
2199{
2200    struct pci_dev *pdev;
2201    int ret;
2202
2203    pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2204    if (!pdev)
2205        return;
2206
2207    printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2208    ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2209
2210    if (ret)
2211        printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2212               "floppy might not work\n");
2213
2214}
2215#else
2216static inline void iommu_prepare_isa(void)
2217{
2218    return;
2219}
2220#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
2221
2222static int md_domain_init(struct dmar_domain *domain, int guest_width);
2223
2224static int __init si_domain_init(int hw)
2225{
2226    struct dmar_drhd_unit *drhd;
2227    struct intel_iommu *iommu;
2228    int nid, ret = 0;
2229
2230    si_domain = alloc_domain();
2231    if (!si_domain)
2232        return -EFAULT;
2233
2234    pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2235
2236    for_each_active_iommu(iommu, drhd) {
2237        ret = iommu_attach_domain(si_domain, iommu);
2238        if (ret) {
2239            domain_exit(si_domain);
2240            return -EFAULT;
2241        }
2242    }
2243
2244    if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2245        domain_exit(si_domain);
2246        return -EFAULT;
2247    }
2248
2249    si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2250
2251    if (hw)
2252        return 0;
2253
2254    for_each_online_node(nid) {
2255        unsigned long start_pfn, end_pfn;
2256        int i;
2257
2258        for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2259            ret = iommu_domain_identity_map(si_domain,
2260                    PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2261            if (ret)
2262                return ret;
2263        }
2264    }
2265
2266    return 0;
2267}
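
/*
 * For software identity mapping every usable RAM range is mapped 1:1 into
 * si_domain at boot. As a sketch, for a hypothetical node whose memory
 * spans pfns 0x100 to 0x80000, for_each_mem_pfn_range() yields that pair
 * and the call above becomes:
 *
 *   iommu_domain_identity_map(si_domain, 0x100000, 0x80000000)
 *
 * Hardware pass-through (hw != 0) returns before this loop, since no page
 * tables are consulted in that mode.
 */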
2268
2269static void domain_remove_one_dev_info(struct dmar_domain *domain,
2270                      struct pci_dev *pdev);
2271static int identity_mapping(struct pci_dev *pdev)
2272{
2273    struct device_domain_info *info;
2274
2275    if (likely(!iommu_identity_mapping))
2276        return 0;
2277
2278    info = pdev->dev.archdata.iommu;
2279    if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2280        return (info->domain == si_domain);
2281
2282    return 0;
2283}
2284
2285static int domain_add_dev_info(struct dmar_domain *domain,
2286                   struct pci_dev *pdev,
2287                   int translation)
2288{
2289    struct device_domain_info *info;
2290    unsigned long flags;
2291    int ret;
2292
2293    info = alloc_devinfo_mem();
2294    if (!info)
2295        return -ENOMEM;
2296
2297    info->segment = pci_domain_nr(pdev->bus);
2298    info->bus = pdev->bus->number;
2299    info->devfn = pdev->devfn;
2300    info->dev = pdev;
2301    info->domain = domain;
2302
2303    spin_lock_irqsave(&device_domain_lock, flags);
2304    list_add(&info->link, &domain->devices);
2305    list_add(&info->global, &device_domain_list);
2306    pdev->dev.archdata.iommu = info;
2307    spin_unlock_irqrestore(&device_domain_lock, flags);
2308
2309    ret = domain_context_mapping(domain, pdev, translation);
2310    if (ret) {
2311        spin_lock_irqsave(&device_domain_lock, flags);
2312        unlink_domain_info(info);
2313        spin_unlock_irqrestore(&device_domain_lock, flags);
2314        free_devinfo_mem(info);
2315        return ret;
2316    }
2317
2318    return 0;
2319}
2320
2321static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2322{
2323    if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2324        return 1;
2325
2326    if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2327        return 1;
2328
2329    if (!(iommu_identity_mapping & IDENTMAP_ALL))
2330        return 0;
2331
2332    /*
2333     * We want to start off with all devices in the 1:1 domain, and
2334     * take them out later if we find they can't access all of memory.
2335     *
2336     * However, we can't do this for PCI devices behind bridges,
2337     * because all PCI devices behind the same bridge will end up
2338     * with the same source-id on their transactions.
2339     *
2340     * Practically speaking, we can't change things around for these
2341     * devices at run-time, because we can't be sure there'll be no
2342     * DMA transactions in flight for any of their siblings.
2343     *
2344     * So PCI devices (unless they're on the root bus) as well as
2345     * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2346     * the 1:1 domain, just in _case_ one of their siblings turns out
2347     * not to be able to map all of memory.
2348     */
2349    if (!pci_is_pcie(pdev)) {
2350        if (!pci_is_root_bus(pdev->bus))
2351            return 0;
2352        if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2353            return 0;
2354    } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2355        return 0;
2356
2357    /*
2358     * At boot time, we don't yet know if devices will be 64-bit capable.
2359     * Assume that they will -- if they turn out not to be, then we can
2360     * take them out of the 1:1 domain later.
2361     */
2362    if (!startup) {
2363        /*
2364         * If the device's dma_mask is less than the system's memory
2365         * size then this is not a candidate for identity mapping.
2366         */
2367        u64 dma_mask = pdev->dma_mask;
2368
2369        if (pdev->dev.coherent_dma_mask &&
2370            pdev->dev.coherent_dma_mask < dma_mask)
2371            dma_mask = pdev->dev.coherent_dma_mask;
2372
2373        return dma_mask >= dma_get_required_mask(&pdev->dev);
2374    }
2375
2376    return 1;
2377}
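
/*
 * A worked example of the run-time (!startup) check above, with
 * hypothetical values: a device whose dma_mask is DMA_BIT_MASK(32) on a
 * machine with 8GiB of RAM, where dma_get_required_mask() reports
 * DMA_BIT_MASK(33). Since 0xffffffff < 0x1ffffffff the function returns 0
 * and the device is kept out of (or later removed from) the 1:1 domain,
 * because it could not reach all of memory through an identity mapping.
 */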
2378
2379static int __init iommu_prepare_static_identity_mapping(int hw)
2380{
2381    struct pci_dev *pdev = NULL;
2382    int ret;
2383
2384    ret = si_domain_init(hw);
2385    if (ret)
2386        return -EFAULT;
2387
2388    for_each_pci_dev(pdev) {
2389        if (iommu_should_identity_map(pdev, 1)) {
2390            ret = domain_add_dev_info(si_domain, pdev,
2391                         hw ? CONTEXT_TT_PASS_THROUGH :
2392                          CONTEXT_TT_MULTI_LEVEL);
2393            if (ret) {
2394                /* device not associated with an iommu */
2395                if (ret == -ENODEV)
2396                    continue;
2397                return ret;
2398            }
2399            pr_info("IOMMU: %s identity mapping for device %s\n",
2400                hw ? "hardware" : "software", pci_name(pdev));
2401        }
2402    }
2403
2404    return 0;
2405}
2406
2407static int __init init_dmars(void)
2408{
2409    struct dmar_drhd_unit *drhd;
2410    struct dmar_rmrr_unit *rmrr;
2411    struct pci_dev *pdev;
2412    struct intel_iommu *iommu;
2413    int i, ret;
2414
2415    /*
2416     * for each drhd
2417     * allocate root
2418     * initialize and program root entry to not present
2419     * endfor
2420     */
2421    for_each_drhd_unit(drhd) {
2422        /*
2423         * lock not needed as this is only incremented in the single
2424         * threaded kernel __init code path; all other accesses are
2425         * read only
2426         */
2427        if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2428            g_num_of_iommus++;
2429            continue;
2430        }
2431        printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2432              IOMMU_UNITS_SUPPORTED);
2433    }
2434
2435    g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2436            GFP_KERNEL);
2437    if (!g_iommus) {
2438        printk(KERN_ERR "Allocating global iommu array failed\n");
2439        ret = -ENOMEM;
2440        goto error;
2441    }
2442
2443    deferred_flush = kzalloc(g_num_of_iommus *
2444        sizeof(struct deferred_flush_tables), GFP_KERNEL);
2445    if (!deferred_flush) {
2446        ret = -ENOMEM;
2447        goto error;
2448    }
2449
2450    for_each_drhd_unit(drhd) {
2451        if (drhd->ignored)
2452            continue;
2453
2454        iommu = drhd->iommu;
2455        g_iommus[iommu->seq_id] = iommu;
2456
2457        ret = iommu_init_domains(iommu);
2458        if (ret)
2459            goto error;
2460
2461        /*
2462         * TBD:
2463         * we could share the same root & context tables
2464         * among all IOMMUs. Needs to be split out later.
2465         */
2466        ret = iommu_alloc_root_entry(iommu);
2467        if (ret) {
2468            printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2469            goto error;
2470        }
2471        if (!ecap_pass_through(iommu->ecap))
2472            hw_pass_through = 0;
2473    }
2474
2475    /*
2476     * Start from a sane IOMMU hardware state.
2477     */
2478    for_each_drhd_unit(drhd) {
2479        if (drhd->ignored)
2480            continue;
2481
2482        iommu = drhd->iommu;
2483
2484        /*
2485         * If the queued invalidation is already initialized by us
2486         * (for example, while enabling interrupt-remapping) then
2487         * things are already rolling from a sane state.
2488         */
2489        if (iommu->qi)
2490            continue;
2491
2492        /*
2493         * Clear any previous faults.
2494         */
2495        dmar_fault(-1, iommu);
2496        /*
2497         * Disable queued invalidation if supported and already enabled
2498         * before OS handover.
2499         */
2500        dmar_disable_qi(iommu);
2501    }
2502
2503    for_each_drhd_unit(drhd) {
2504        if (drhd->ignored)
2505            continue;
2506
2507        iommu = drhd->iommu;
2508
2509        if (dmar_enable_qi(iommu)) {
2510            /*
2511             * Queued Invalidate not enabled, use Register Based
2512             * Invalidate
2513             */
2514            iommu->flush.flush_context = __iommu_flush_context;
2515            iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2516            printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2517                   "invalidation\n",
2518                iommu->seq_id,
2519                   (unsigned long long)drhd->reg_base_addr);
2520        } else {
2521            iommu->flush.flush_context = qi_flush_context;
2522            iommu->flush.flush_iotlb = qi_flush_iotlb;
2523            printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2524                   "invalidation\n",
2525                iommu->seq_id,
2526                   (unsigned long long)drhd->reg_base_addr);
2527        }
2528    }
2529
2530    if (iommu_pass_through)
2531        iommu_identity_mapping |= IDENTMAP_ALL;
2532
2533#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2534    iommu_identity_mapping |= IDENTMAP_GFX;
2535#endif
2536
2537    check_tylersburg_isoch();
2538
2539    /*
2540     * If pass through is not set or not enabled, set up context entries
2541     * for identity mappings for rmrr, gfx, and isa, and possibly fall back
2542     * to static identity mapping if iommu_identity_mapping is set.
2543     */
2544    if (iommu_identity_mapping) {
2545        ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2546        if (ret) {
2547            printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2548            goto error;
2549        }
2550    }
2551    /*
2552     * For each rmrr
2553     *   for each dev attached to rmrr
2554     *   do
2555     *     locate drhd for dev, alloc domain for dev
2556     *     allocate free domain
2557     *     allocate page table entries for rmrr
2558     *     if context not allocated for bus
2559     *       allocate and init context
2560     *       set present in root table for this bus
2561     *     init context with domain, translation etc
2562     *   endfor
2563     * endfor
2564     */
2565    printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2566    for_each_rmrr_units(rmrr) {
2567        for (i = 0; i < rmrr->devices_cnt; i++) {
2568            pdev = rmrr->devices[i];
2569            /*
2570             * some BIOSes list non-existent devices in the
2571             * DMAR table.
2572             */
2573            if (!pdev)
2574                continue;
2575            ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2576            if (ret)
2577                printk(KERN_ERR
2578                       "IOMMU: mapping reserved region failed\n");
2579        }
2580    }
2581
2582    iommu_prepare_isa();
2583
2584    /*
2585     * for each drhd
2586     * enable fault log
2587     * global invalidate context cache
2588     * global invalidate iotlb
2589     * enable translation
2590     */
2591    for_each_drhd_unit(drhd) {
2592        if (drhd->ignored) {
2593            /*
2594             * we always have to disable PMRs or DMA may fail on
2595             * this device
2596             */
2597            if (force_on)
2598                iommu_disable_protect_mem_regions(drhd->iommu);
2599            continue;
2600        }
2601        iommu = drhd->iommu;
2602
2603        iommu_flush_write_buffer(iommu);
2604
2605        ret = dmar_set_interrupt(iommu);
2606        if (ret)
2607            goto error;
2608
2609        iommu_set_root_entry(iommu);
2610
2611        iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2612        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2613
2614        ret = iommu_enable_translation(iommu);
2615        if (ret)
2616            goto error;
2617
2618        iommu_disable_protect_mem_regions(iommu);
2619    }
2620
2621    return 0;
2622error:
2623    for_each_drhd_unit(drhd) {
2624        if (drhd->ignored)
2625            continue;
2626        iommu = drhd->iommu;
2627        free_iommu(iommu);
2628    }
2629    kfree(g_iommus);
2630    return ret;
2631}
2632
2633/* This takes a number of _MM_ pages, not VTD pages */
2634static struct iova *intel_alloc_iova(struct device *dev,
2635                     struct dmar_domain *domain,
2636                     unsigned long nrpages, uint64_t dma_mask)
2637{
2638    struct pci_dev *pdev = to_pci_dev(dev);
2639    struct iova *iova = NULL;
2640
2641    /* Restrict dma_mask to the width that the iommu can handle */
2642    dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2643
2644    if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2645        /*
2646         * First try to allocate an io virtual address in
2647         * DMA_BIT_MASK(32) and if that fails then try allocating
2648         * from the higher range
2649         */
2650        iova = alloc_iova(&domain->iovad, nrpages,
2651                  IOVA_PFN(DMA_BIT_MASK(32)), 1);
2652        if (iova)
2653            return iova;
2654    }
2655    iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2656    if (unlikely(!iova)) {
2657        printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2658               nrpages, pci_name(pdev));
2659        return NULL;
2660    }
2661
2662    return iova;
2663}
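
/*
 * Allocation policy sketch, assuming dmar_forcedac is not set: a device
 * with a 64-bit dma_mask first gets a try below 4GiB and only falls back
 * to the full range if the 32-bit IOVA space is exhausted; a device with
 * a 32-bit mask goes straight to the single alloc_iova() call, which is
 * then bounded by IOVA_PFN(DMA_BIT_MASK(32)).
 */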
2664
2665static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2666{
2667    struct dmar_domain *domain;
2668    int ret;
2669
2670    domain = get_domain_for_dev(pdev,
2671            DEFAULT_DOMAIN_ADDRESS_WIDTH);
2672    if (!domain) {
2673        printk(KERN_ERR
2674            "Allocating domain for %s failed\n", pci_name(pdev));
2675        return NULL;
2676    }
2677
2678    /* make sure context mapping is ok */
2679    if (unlikely(!domain_context_mapped(pdev))) {
2680        ret = domain_context_mapping(domain, pdev,
2681                         CONTEXT_TT_MULTI_LEVEL);
2682        if (ret) {
2683            printk(KERN_ERR
2684                "Domain context map for %s failed\n",
2685                pci_name(pdev));
2686            return NULL;
2687        }
2688    }
2689
2690    return domain;
2691}
2692
2693static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2694{
2695    struct device_domain_info *info;
2696
2697    /* No lock here, assumes no domain exit in normal case */
2698    info = dev->dev.archdata.iommu;
2699    if (likely(info))
2700        return info->domain;
2701
2702    return __get_valid_domain_for_dev(dev);
2703}
2704
2705static int iommu_dummy(struct pci_dev *pdev)
2706{
2707    return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2708}
2709
2710/* Check if the pdev needs to go through the non-identity map and unmap process. */
2711static int iommu_no_mapping(struct device *dev)
2712{
2713    struct pci_dev *pdev;
2714    int found;
2715
2716    if (unlikely(dev->bus != &pci_bus_type))
2717        return 1;
2718
2719    pdev = to_pci_dev(dev);
2720    if (iommu_dummy(pdev))
2721        return 1;
2722
2723    if (!iommu_identity_mapping)
2724        return 0;
2725
2726    found = identity_mapping(pdev);
2727    if (found) {
2728        if (iommu_should_identity_map(pdev, 0))
2729            return 1;
2730        else {
2731            /*
2732             * The 32 bit DMA device is removed from si_domain; fall
2733             * back to non-identity mapping.
2734             */
2735            domain_remove_one_dev_info(si_domain, pdev);
2736            printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2737                   pci_name(pdev));
2738            return 0;
2739        }
2740    } else {
2741        /*
2742         * If a 64 bit DMA device was detached from a VM, the device
2743         * is put back into si_domain for identity mapping.
2744         */
2745        if (iommu_should_identity_map(pdev, 0)) {
2746            int ret;
2747            ret = domain_add_dev_info(si_domain, pdev,
2748                          hw_pass_through ?
2749                          CONTEXT_TT_PASS_THROUGH :
2750                          CONTEXT_TT_MULTI_LEVEL);
2751            if (!ret) {
2752                printk(KERN_INFO "64bit %s uses identity mapping\n",
2753                       pci_name(pdev));
2754                return 1;
2755            }
2756        }
2757    }
2758
2759    return 0;
2760}
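
/*
 * Summary of the decision above: return 1 ("no IOMMU mapping needed") for
 * non-PCI devices, dummy devices, and devices that are in (or have just
 * been added to) the identity domain; return 0 when the device must go
 * through the regular map/unmap path, including the case of a formerly
 * identity-mapped device whose DMA mask turned out to be too small for
 * 1:1 addressing, and the case where identity mapping is not in use at all.
 */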
2761
2762static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2763                     size_t size, int dir, u64 dma_mask)
2764{
2765    struct pci_dev *pdev = to_pci_dev(hwdev);
2766    struct dmar_domain *domain;
2767    phys_addr_t start_paddr;
2768    struct iova *iova;
2769    int prot = 0;
2770    int ret;
2771    struct intel_iommu *iommu;
2772    unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2773
2774    BUG_ON(dir == DMA_NONE);
2775
2776    if (iommu_no_mapping(hwdev))
2777        return paddr;
2778
2779    domain = get_valid_domain_for_dev(pdev);
2780    if (!domain)
2781        return 0;
2782
2783    iommu = domain_get_iommu(domain);
2784    size = aligned_nrpages(paddr, size);
2785
2786    iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2787    if (!iova)
2788        goto error;
2789
2790    /*
2791     * Check if DMAR supports zero-length reads on write only
2792     * mappings.
2793     */
2794    if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2795            !cap_zlr(iommu->cap))
2796        prot |= DMA_PTE_READ;
2797    if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2798        prot |= DMA_PTE_WRITE;
2799    /*
2800     * paddr to (paddr + size) might span a partial page, so we map the
2801     * whole page. Note: if two parts of one page are mapped separately,
2802     * we might end up with two guest addresses mapping to the same host
2803     * paddr, but this is not a big problem
2804     */
2805    ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2806                 mm_to_dma_pfn(paddr_pfn), size, prot);
2807    if (ret)
2808        goto error;
2809
2810    /* it's a non-present to present mapping. Only flush if caching mode */
2811    if (cap_caching_mode(iommu->cap))
2812        iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2813    else
2814        iommu_flush_write_buffer(iommu);
2815
2816    start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2817    start_paddr += paddr & ~PAGE_MASK;
2818    return start_paddr;
2819
2820error:
2821    if (iova)
2822        __free_iova(&domain->iovad, iova);
2823    printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2824        pci_name(pdev), size, (unsigned long long)paddr, dir);
2825    return 0;
2826}
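
/*
 * The returned bus address keeps the sub-page offset of the original
 * buffer. A worked example with hypothetical values and PAGE_SHIFT == 12:
 * if the IOVA allocator hands back pfn_lo = 0x12345 and the caller passed
 * paddr = 0xabcd1234, the device is given
 *
 *   start_paddr = (0x12345 << 12) + 0x234 = 0x12345234
 *
 * while the IOMMU page tables translate IOVA page 0x12345 to physical
 * page 0xabcd1.
 */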
2827
2828static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2829                 unsigned long offset, size_t size,
2830                 enum dma_data_direction dir,
2831                 struct dma_attrs *attrs)
2832{
2833    return __intel_map_single(dev, page_to_phys(page) + offset, size,
2834                  dir, to_pci_dev(dev)->dma_mask);
2835}
2836
2837static void flush_unmaps(void)
2838{
2839    int i, j;
2840
2841    timer_on = 0;
2842
2843    /* just flush them all */
2844    for (i = 0; i < g_num_of_iommus; i++) {
2845        struct intel_iommu *iommu = g_iommus[i];
2846        if (!iommu)
2847            continue;
2848
2849        if (!deferred_flush[i].next)
2850            continue;
2851
2852        /* In caching mode, global flushes make emulation expensive */
2853        if (!cap_caching_mode(iommu->cap))
2854            iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2855                     DMA_TLB_GLOBAL_FLUSH);
2856        for (j = 0; j < deferred_flush[i].next; j++) {
2857            unsigned long mask;
2858            struct iova *iova = deferred_flush[i].iova[j];
2859            struct dmar_domain *domain = deferred_flush[i].domain[j];
2860
2861            /* On real hardware multiple invalidations are expensive */
2862            if (cap_caching_mode(iommu->cap))
2863                iommu_flush_iotlb_psi(iommu, domain->id,
2864                iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2865            else {
2866                mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2867                iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2868                        (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2869            }
2870            __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2871        }
2872        deferred_flush[i].next = 0;
2873    }
2874
2875    list_size = 0;
2876}
2877
2878static void flush_unmaps_timeout(unsigned long data)
2879{
2880    unsigned long flags;
2881
2882    spin_lock_irqsave(&async_umap_flush_lock, flags);
2883    flush_unmaps();
2884    spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2885}
2886
2887static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2888{
2889    unsigned long flags;
2890    int next, iommu_id;
2891    struct intel_iommu *iommu;
2892
2893    spin_lock_irqsave(&async_umap_flush_lock, flags);
2894    if (list_size == HIGH_WATER_MARK)
2895        flush_unmaps();
2896
2897    iommu = domain_get_iommu(dom);
2898    iommu_id = iommu->seq_id;
2899
2900    next = deferred_flush[iommu_id].next;
2901    deferred_flush[iommu_id].domain[next] = dom;
2902    deferred_flush[iommu_id].iova[next] = iova;
2903    deferred_flush[iommu_id].next++;
2904
2905    if (!timer_on) {
2906        mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2907        timer_on = 1;
2908    }
2909    list_size++;
2910    spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2911}
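
/*
 * The deferred-unmap path above trades a short window of stale IOTLB
 * entries for far fewer flushes: unmapped IOVAs pile up per IOMMU in
 * deferred_flush[] and are released either when HIGH_WATER_MARK pending
 * entries accumulate or when the 10ms unmap_timer fires, whichever comes
 * first. When intel_iommu_strict is set (the intel_iommu=strict boot
 * option), the unmap paths below flush synchronously instead.
 */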
2912
2913static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2914                 size_t size, enum dma_data_direction dir,
2915                 struct dma_attrs *attrs)
2916{
2917    struct pci_dev *pdev = to_pci_dev(dev);
2918    struct dmar_domain *domain;
2919    unsigned long start_pfn, last_pfn;
2920    struct iova *iova;
2921    struct intel_iommu *iommu;
2922
2923    if (iommu_no_mapping(dev))
2924        return;
2925
2926    domain = find_domain(pdev);
2927    BUG_ON(!domain);
2928
2929    iommu = domain_get_iommu(domain);
2930
2931    iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2932    if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2933              (unsigned long long)dev_addr))
2934        return;
2935
2936    start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2937    last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2938
2939    pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2940         pci_name(pdev), start_pfn, last_pfn);
2941
2942    /* clear the whole page */
2943    dma_pte_clear_range(domain, start_pfn, last_pfn);
2944
2945    /* free page tables */
2946    dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2947
2948    if (intel_iommu_strict) {
2949        iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2950                      last_pfn - start_pfn + 1, 0);
2951        /* free iova */
2952        __free_iova(&domain->iovad, iova);
2953    } else {
2954        add_unmap(domain, iova);
2955        /*
2956         * queue up the release of the unmap to save the roughly 1/6 of
2957         * the CPU time used up by the iotlb flush operation...
2958         */
2959    }
2960}
2961
2962static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2963                  dma_addr_t *dma_handle, gfp_t flags,
2964                  struct dma_attrs *attrs)
2965{
2966    void *vaddr;
2967    int order;
2968
2969    size = PAGE_ALIGN(size);
2970    order = get_order(size);
2971
2972    if (!iommu_no_mapping(hwdev))
2973        flags &= ~(GFP_DMA | GFP_DMA32);
2974    else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2975        if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2976            flags |= GFP_DMA;
2977        else
2978            flags |= GFP_DMA32;
2979    }
2980
2981    vaddr = (void *)__get_free_pages(flags, order);
2982    if (!vaddr)
2983        return NULL;
2984    memset(vaddr, 0, size);
2985
2986    *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2987                     DMA_BIDIRECTIONAL,
2988                     hwdev->coherent_dma_mask);
2989    if (*dma_handle)
2990        return vaddr;
2991    free_pages((unsigned long)vaddr, order);
2992    return NULL;
2993}
2994
2995static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2996                dma_addr_t dma_handle, struct dma_attrs *attrs)
2997{
2998    int order;
2999
3000    size = PAGE_ALIGN(size);
3001    order = get_order(size);
3002
3003    intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3004    free_pages((unsigned long)vaddr, order);
3005}
3006
3007static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3008               int nelems, enum dma_data_direction dir,
3009               struct dma_attrs *attrs)
3010{
3011    struct pci_dev *pdev = to_pci_dev(hwdev);
3012    struct dmar_domain *domain;
3013    unsigned long start_pfn, last_pfn;
3014    struct iova *iova;
3015    struct intel_iommu *iommu;
3016
3017    if (iommu_no_mapping(hwdev))
3018        return;
3019
3020    domain = find_domain(pdev);
3021    BUG_ON(!domain);
3022
3023    iommu = domain_get_iommu(domain);
3024
3025    iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3026    if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3027              (unsigned long long)sglist[0].dma_address))
3028        return;
3029
3030    start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3031    last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3032
3033    /* clear the whole page */
3034    dma_pte_clear_range(domain, start_pfn, last_pfn);
3035
3036    /* free page tables */
3037    dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3038
3039    if (intel_iommu_strict) {
3040        iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3041                      last_pfn - start_pfn + 1, 0);
3042        /* free iova */
3043        __free_iova(&domain->iovad, iova);
3044    } else {
3045        add_unmap(domain, iova);
3046        /*
3047         * queue up the release of the unmap to save the roughly 1/6 of
3048         * the CPU time used up by the iotlb flush operation...
3049         */
3050    }
3051}
3052
3053static int intel_nontranslate_map_sg(struct device *hddev,
3054    struct scatterlist *sglist, int nelems, int dir)
3055{
3056    int i;
3057    struct scatterlist *sg;
3058
3059    for_each_sg(sglist, sg, nelems, i) {
3060        BUG_ON(!sg_page(sg));
3061        sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3062        sg->dma_length = sg->length;
3063    }
3064    return nelems;
3065}
3066
3067static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3068            enum dma_data_direction dir, struct dma_attrs *attrs)
3069{
3070    int i;
3071    struct pci_dev *pdev = to_pci_dev(hwdev);
3072    struct dmar_domain *domain;
3073    size_t size = 0;
3074    int prot = 0;
3075    struct iova *iova = NULL;
3076    int ret;
3077    struct scatterlist *sg;
3078    unsigned long start_vpfn;
3079    struct intel_iommu *iommu;
3080
3081    BUG_ON(dir == DMA_NONE);
3082    if (iommu_no_mapping(hwdev))
3083        return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3084
3085    domain = get_valid_domain_for_dev(pdev);
3086    if (!domain)
3087        return 0;
3088
3089    iommu = domain_get_iommu(domain);
3090
3091    for_each_sg(sglist, sg, nelems, i)
3092        size += aligned_nrpages(sg->offset, sg->length);
3093
3094    iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3095                pdev->dma_mask);
3096    if (!iova) {
3097        sglist->dma_length = 0;
3098        return 0;
3099    }
3100
3101    /*
3102     * Check if DMAR supports zero-length reads on write only
3103     * mappings..
3104     * mappings.
3105    if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3106            !cap_zlr(iommu->cap))
3107        prot |= DMA_PTE_READ;
3108    if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3109        prot |= DMA_PTE_WRITE;
3110
3111    start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3112
3113    ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3114    if (unlikely(ret)) {
3115        /* clear the page */
3116        dma_pte_clear_range(domain, start_vpfn,
3117                    start_vpfn + size - 1);
3118        /* free page tables */
3119        dma_pte_free_pagetable(domain, start_vpfn,
3120                       start_vpfn + size - 1);
3121        /* free iova */
3122        __free_iova(&domain->iovad, iova);
3123        return 0;
3124    }
3125
3126    /* it's a non-present to present mapping. Only flush if caching mode */
3127    if (cap_caching_mode(iommu->cap))
3128        iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3129    else
3130        iommu_flush_write_buffer(iommu);
3131
3132    return nelems;
3133}
3134
3135static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3136{
3137    return !dma_addr;
3138}
3139
3140struct dma_map_ops intel_dma_ops = {
3141    .alloc = intel_alloc_coherent,
3142    .free = intel_free_coherent,
3143    .map_sg = intel_map_sg,
3144    .unmap_sg = intel_unmap_sg,
3145    .map_page = intel_map_page,
3146    .unmap_page = intel_unmap_page,
3147    .mapping_error = intel_mapping_error,
3148};
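
/*
 * Once intel_dma_ops is installed as the platform's dma_map_ops, drivers
 * do not call the functions above directly; the generic DMA API dispatches
 * to them. A hedged sketch with a hypothetical PCI device and page:
 *
 *   dma_addr_t h = dma_map_page(&pdev->dev, page, 0, PAGE_SIZE,
 *                               DMA_FROM_DEVICE);
 *   if (dma_mapping_error(&pdev->dev, h))
 *           return -EIO;
 *   ...
 *   dma_unmap_page(&pdev->dev, h, PAGE_SIZE, DMA_FROM_DEVICE);
 *
 * dma_map_page() ends up in intel_map_page(), dma_mapping_error() in
 * intel_mapping_error(), and dma_unmap_page() in intel_unmap_page().
 */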
3149
3150static inline int iommu_domain_cache_init(void)
3151{
3152    int ret = 0;
3153
3154    iommu_domain_cache = kmem_cache_create("iommu_domain",
3155                     sizeof(struct dmar_domain),
3156                     0,
3157                     SLAB_HWCACHE_ALIGN,
3158
3159                     NULL);
3160    if (!iommu_domain_cache) {
3161        printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3162        ret = -ENOMEM;
3163    }
3164
3165    return ret;
3166}
3167
3168static inline int iommu_devinfo_cache_init(void)
3169{
3170    int ret = 0;
3171
3172    iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3173                     sizeof(struct device_domain_info),
3174                     0,
3175                     SLAB_HWCACHE_ALIGN,
3176                     NULL);
3177    if (!iommu_devinfo_cache) {
3178        printk(KERN_ERR "Couldn't create devinfo cache\n");
3179        ret = -ENOMEM;
3180    }
3181
3182    return ret;
3183}
3184
3185static inline int iommu_iova_cache_init(void)
3186{
3187    int ret = 0;
3188
3189    iommu_iova_cache = kmem_cache_create("iommu_iova",
3190                     sizeof(struct iova),
3191                     0,
3192                     SLAB_HWCACHE_ALIGN,
3193                     NULL);
3194    if (!iommu_iova_cache) {
3195        printk(KERN_ERR "Couldn't create iova cache\n");
3196        ret = -ENOMEM;
3197    }
3198
3199    return ret;
3200}
3201
3202static int __init iommu_init_mempool(void)
3203{
3204    int ret;
3205    ret = iommu_iova_cache_init();
3206    if (ret)
3207        return ret;
3208
3209    ret = iommu_domain_cache_init();
3210    if (ret)
3211        goto domain_error;
3212
3213    ret = iommu_devinfo_cache_init();
3214    if (!ret)
3215        return ret;
3216
3217    kmem_cache_destroy(iommu_domain_cache);
3218domain_error:
3219    kmem_cache_destroy(iommu_iova_cache);
3220
3221    return -ENOMEM;
3222}
3223
3224static void __init iommu_exit_mempool(void)
3225{
3226    kmem_cache_destroy(iommu_devinfo_cache);
3227    kmem_cache_destroy(iommu_domain_cache);
3228    kmem_cache_destroy(iommu_iova_cache);
3229
3230}
3231
3232static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3233{
3234    struct dmar_drhd_unit *drhd;
3235    u32 vtbar;
3236    int rc;
3237
3238    /* We know that this device on this chipset has its own IOMMU.
3239     * If we find it under a different IOMMU, then the BIOS is lying
3240     * to us. Hope that the IOMMU for this device is actually
3241     * disabled, and it needs no translation...
3242     */
3243    rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3244    if (rc) {
3245        /* "can't" happen */
3246        dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3247        return;
3248    }
3249    vtbar &= 0xffff0000;
3250
3251    /* we know that this iommu should be at offset 0xa000 from vtbar */
3252    drhd = dmar_find_matched_drhd_unit(pdev);
3253    if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3254                TAINT_FIRMWARE_WORKAROUND,
3255                "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3256        pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3257}
3258DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3259
3260static void __init init_no_remapping_devices(void)
3261{
3262    struct dmar_drhd_unit *drhd;
3263
3264    for_each_drhd_unit(drhd) {
3265        if (!drhd->include_all) {
3266            int i;
3267            for (i = 0; i < drhd->devices_cnt; i++)
3268                if (drhd->devices[i] != NULL)
3269                    break;
3270            /* ignore DMAR unit if no pci devices exist */
3271            if (i == drhd->devices_cnt)
3272                drhd->ignored = 1;
3273        }
3274    }
3275
3276    for_each_drhd_unit(drhd) {
3277        int i;
3278        if (drhd->ignored || drhd->include_all)
3279            continue;
3280
3281        for (i = 0; i < drhd->devices_cnt; i++)
3282            if (drhd->devices[i] &&
3283                !IS_GFX_DEVICE(drhd->devices[i]))
3284                break;
3285
3286        if (i < drhd->devices_cnt)
3287            continue;
3288
3289        /* This IOMMU has *only* gfx devices. Either bypass it or
3290           set the gfx_mapped flag, as appropriate */
3291        if (dmar_map_gfx) {
3292            intel_iommu_gfx_mapped = 1;
3293        } else {
3294            drhd->ignored = 1;
3295            for (i = 0; i < drhd->devices_cnt; i++) {
3296                if (!drhd->devices[i])
3297                    continue;
3298                drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3299            }
3300        }
3301    }
3302}
3303
3304#ifdef CONFIG_SUSPEND
3305static int init_iommu_hw(void)
3306{
3307    struct dmar_drhd_unit *drhd;
3308    struct intel_iommu *iommu = NULL;
3309
3310    for_each_active_iommu(iommu, drhd)
3311        if (iommu->qi)
3312            dmar_reenable_qi(iommu);
3313
3314    for_each_iommu(iommu, drhd) {
3315        if (drhd->ignored) {
3316            /*
3317             * we always have to disable PMRs or DMA may fail on
3318             * this device
3319             */
3320            if (force_on)
3321                iommu_disable_protect_mem_regions(iommu);
3322            continue;
3323        }
3324
3325        iommu_flush_write_buffer(iommu);
3326
3327        iommu_set_root_entry(iommu);
3328
3329        iommu->flush.flush_context(iommu, 0, 0, 0,
3330                       DMA_CCMD_GLOBAL_INVL);
3331        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3332                     DMA_TLB_GLOBAL_FLUSH);
3333        if (iommu_enable_translation(iommu))
3334            return 1;
3335        iommu_disable_protect_mem_regions(iommu);
3336    }
3337
3338    return 0;
3339}
3340
3341static void iommu_flush_all(void)
3342{
3343    struct dmar_drhd_unit *drhd;
3344    struct intel_iommu *iommu;
3345
3346    for_each_active_iommu(iommu, drhd) {
3347        iommu->flush.flush_context(iommu, 0, 0, 0,
3348                       DMA_CCMD_GLOBAL_INVL);
3349        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3350                     DMA_TLB_GLOBAL_FLUSH);
3351    }
3352}
3353
3354static int iommu_suspend(void)
3355{
3356    struct dmar_drhd_unit *drhd;
3357    struct intel_iommu *iommu = NULL;
3358    unsigned long flag;
3359
3360    for_each_active_iommu(iommu, drhd) {
3361        iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3362                         GFP_ATOMIC);
3363        if (!iommu->iommu_state)
3364            goto nomem;
3365    }
3366
3367    iommu_flush_all();
3368
3369    for_each_active_iommu(iommu, drhd) {
3370        iommu_disable_translation(iommu);
3371
3372        raw_spin_lock_irqsave(&iommu->register_lock, flag);
3373
3374        iommu->iommu_state[SR_DMAR_FECTL_REG] =
3375            readl(iommu->reg + DMAR_FECTL_REG);
3376        iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3377            readl(iommu->reg + DMAR_FEDATA_REG);
3378        iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3379            readl(iommu->reg + DMAR_FEADDR_REG);
3380        iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3381            readl(iommu->reg + DMAR_FEUADDR_REG);
3382
3383        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3384    }
3385    return 0;
3386
3387nomem:
3388    for_each_active_iommu(iommu, drhd)
3389        kfree(iommu->iommu_state);
3390
3391    return -ENOMEM;
3392}
3393
3394static void iommu_resume(void)
3395{
3396    struct dmar_drhd_unit *drhd;
3397    struct intel_iommu *iommu = NULL;
3398    unsigned long flag;
3399
3400    if (init_iommu_hw()) {
3401        if (force_on)
3402            panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3403        else
3404            WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3405        return;
3406    }
3407
3408    for_each_active_iommu(iommu, drhd) {
3409
3410        raw_spin_lock_irqsave(&iommu->register_lock, flag);
3411
3412        writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3413            iommu->reg + DMAR_FECTL_REG);
3414        writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3415            iommu->reg + DMAR_FEDATA_REG);
3416        writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3417            iommu->reg + DMAR_FEADDR_REG);
3418        writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3419            iommu->reg + DMAR_FEUADDR_REG);
3420
3421        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3422    }
3423
3424    for_each_active_iommu(iommu, drhd)
3425        kfree(iommu->iommu_state);
3426}
3427
3428static struct syscore_ops iommu_syscore_ops = {
3429    .resume = iommu_resume,
3430    .suspend = iommu_suspend,
3431};
3432
3433static void __init init_iommu_pm_ops(void)
3434{
3435    register_syscore_ops(&iommu_syscore_ops);
3436}
3437
3438#else
3439static inline void init_iommu_pm_ops(void) {}
3440#endif /* CONFIG_SUSPEND */
3441
3442LIST_HEAD(dmar_rmrr_units);
3443
3444static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3445{
3446    list_add(&rmrr->list, &dmar_rmrr_units);
3447}
3448
3449
3450int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3451{
3452    struct acpi_dmar_reserved_memory *rmrr;
3453    struct dmar_rmrr_unit *rmrru;
3454
3455    rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3456    if (!rmrru)
3457        return -ENOMEM;
3458
3459    rmrru->hdr = header;
3460    rmrr = (struct acpi_dmar_reserved_memory *)header;
3461    rmrru->base_address = rmrr->base_address;
3462    rmrru->end_address = rmrr->end_address;
3463
3464    dmar_register_rmrr_unit(rmrru);
3465    return 0;
3466}
3467
3468static int __init
3469rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3470{
3471    struct acpi_dmar_reserved_memory *rmrr;
3472    int ret;
3473
3474    rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3475    ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3476        ((void *)rmrr) + rmrr->header.length,
3477        &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3478
3479    if (ret || (rmrru->devices_cnt == 0)) {
3480        list_del(&rmrru->list);
3481        kfree(rmrru);
3482    }
3483    return ret;
3484}
3485
3486static LIST_HEAD(dmar_atsr_units);
3487
3488int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3489{
3490    struct acpi_dmar_atsr *atsr;
3491    struct dmar_atsr_unit *atsru;
3492
3493    atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3494    atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3495    if (!atsru)
3496        return -ENOMEM;
3497
3498    atsru->hdr = hdr;
3499    atsru->include_all = atsr->flags & 0x1;
3500
3501    list_add(&atsru->list, &dmar_atsr_units);
3502
3503    return 0;
3504}
3505
3506static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3507{
3508    int rc;
3509    struct acpi_dmar_atsr *atsr;
3510
3511    if (atsru->include_all)
3512        return 0;
3513
3514    atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3515    rc = dmar_parse_dev_scope((void *)(atsr + 1),
3516                (void *)atsr + atsr->header.length,
3517                &atsru->devices_cnt, &atsru->devices,
3518                atsr->segment);
3519    if (rc || !atsru->devices_cnt) {
3520        list_del(&atsru->list);
3521        kfree(atsru);
3522    }
3523
3524    return rc;
3525}
3526
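/*
 * dmar_find_matched_atsr_unit() - return 1 if @dev sits below a root port
 * listed in the ATSR for its PCI segment, or if that ATSR has include_all
 * set; return 0 otherwise.  SR-IOV virtual functions are checked through
 * their physical function (pci_physfn()).
 */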
3527int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3528{
3529    int i;
3530    struct pci_bus *bus;
3531    struct acpi_dmar_atsr *atsr;
3532    struct dmar_atsr_unit *atsru;
3533
3534    dev = pci_physfn(dev);
3535
3536    list_for_each_entry(atsru, &dmar_atsr_units, list) {
3537        atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3538        if (atsr->segment == pci_domain_nr(dev->bus))
3539            goto found;
3540    }
3541
3542    return 0;
3543
3544found:
3545    for (bus = dev->bus; bus; bus = bus->parent) {
3546        struct pci_dev *bridge = bus->self;
3547
3548        if (!bridge || !pci_is_pcie(bridge) ||
3549            bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3550            return 0;
3551
3552        if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3553            for (i = 0; i < atsru->devices_cnt; i++)
3554                if (atsru->devices[i] == bridge)
3555                    return 1;
3556            break;
3557        }
3558    }
3559
3560    if (atsru->include_all)
3561        return 1;
3562
3563    return 0;
3564}
3565
3566int __init dmar_parse_rmrr_atsr_dev(void)
3567{
3568    struct dmar_rmrr_unit *rmrr, *rmrr_n;
3569    struct dmar_atsr_unit *atsr, *atsr_n;
3570    int ret = 0;
3571
3572    list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3573        ret = rmrr_parse_dev(rmrr);
3574        if (ret)
3575            return ret;
3576    }
3577
3578    list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3579        ret = atsr_parse_dev(atsr);
3580        if (ret)
3581            return ret;
3582    }
3583
3584    return ret;
3585}
3586
3587/*
3588 * Here we only respond to a device being unbound from its driver.
3589 *
3590 * A newly added device is not attached to its DMAR domain here yet; that
3591 * happens when the device is first mapped to an iova.
3592 */
3593static int device_notifier(struct notifier_block *nb,
3594                  unsigned long action, void *data)
3595{
3596    struct device *dev = data;
3597    struct pci_dev *pdev = to_pci_dev(dev);
3598    struct dmar_domain *domain;
3599
3600    if (iommu_no_mapping(dev))
3601        return 0;
3602
3603    domain = find_domain(pdev);
3604    if (!domain)
3605        return 0;
3606
3607    if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3608        domain_remove_one_dev_info(domain, pdev);
3609
3610        if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3611            !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3612            list_empty(&domain->devices))
3613            domain_exit(domain);
3614    }
3615
3616    return 0;
3617}
3618
3619static struct notifier_block device_nb = {
3620    .notifier_call = device_notifier,
3621};
3622
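/*
 * intel_iommu_init() - late entry point for VT-d DMA remapping.  Ordering
 * matters here: the DMAR table and device scopes are parsed first, then the
 * mempools and reserved iova ranges are set up, and only after init_dmars()
 * succeeds are dma_ops switched to intel_dma_ops, the PM hooks registered,
 * and intel_iommu_ops plus the device notifier wired up on the PCI bus.
 */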
3623int __init intel_iommu_init(void)
3624{
3625    int ret = 0;
3626
3627    /* VT-d is required for a TXT/tboot launch, so enforce that */
3628    force_on = tboot_force_iommu();
3629
3630    if (dmar_table_init()) {
3631        if (force_on)
3632            panic("tboot: Failed to initialize DMAR table\n");
3633        return -ENODEV;
3634    }
3635
3636    if (dmar_dev_scope_init() < 0) {
3637        if (force_on)
3638            panic("tboot: Failed to initialize DMAR device scope\n");
3639        return -ENODEV;
3640    }
3641
3642    if (no_iommu || dmar_disabled)
3643        return -ENODEV;
3644
3645    if (iommu_init_mempool()) {
3646        if (force_on)
3647            panic("tboot: Failed to initialize iommu memory\n");
3648        return -ENODEV;
3649    }
3650
3651    if (list_empty(&dmar_rmrr_units))
3652        printk(KERN_INFO "DMAR: No RMRR found\n");
3653
3654    if (list_empty(&dmar_atsr_units))
3655        printk(KERN_INFO "DMAR: No ATSR found\n");
3656
3657    if (dmar_init_reserved_ranges()) {
3658        if (force_on)
3659            panic("tboot: Failed to reserve iommu ranges\n");
3660        return -ENODEV;
3661    }
3662
3663    init_no_remapping_devices();
3664
3665    ret = init_dmars();
3666    if (ret) {
3667        if (force_on)
3668            panic("tboot: Failed to initialize DMARs\n");
3669        printk(KERN_ERR "IOMMU: dmar init failed\n");
3670        put_iova_domain(&reserved_iova_list);
3671        iommu_exit_mempool();
3672        return ret;
3673    }
3674    printk(KERN_INFO
3675    "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3676
3677    init_timer(&unmap_timer);
3678#ifdef CONFIG_SWIOTLB
3679    swiotlb = 0;
3680#endif
3681    dma_ops = &intel_dma_ops;
3682
3683    init_iommu_pm_ops();
3684
3685    bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3686
3687    bus_register_notifier(&pci_bus_type, &device_nb);
3688
3689    intel_iommu_enabled = 1;
3690
3691    return 0;
3692}
3693
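/*
 * If @pdev sits behind a PCIe-to-PCI(-X) bridge, the bridges on the path to
 * it were given context entries of their own, so tear those down as well:
 * every parent bridge up to the upstream PCIe bridge, and then that bridge
 * itself (its secondary bus/devfn 0 for PCIe, its own bus/devfn otherwise).
 */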
3694static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3695                       struct pci_dev *pdev)
3696{
3697    struct pci_dev *tmp, *parent;
3698
3699    if (!iommu || !pdev)
3700        return;
3701
3702    /* dependent device detach */
3703    tmp = pci_find_upstream_pcie_bridge(pdev);
3704    /* a PCIe-to-PCI bridge is addressed by its secondary bus number and devfn 0 */
3705    if (tmp) {
3706        parent = pdev->bus->self;
3707        while (parent != tmp) {
3708            iommu_detach_dev(iommu, parent->bus->number,
3709                     parent->devfn);
3710            parent = parent->bus->self;
3711        }
3712        if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3713            iommu_detach_dev(iommu,
3714                tmp->subordinate->number, 0);
3715        else /* this is a legacy PCI bridge */
3716            iommu_detach_dev(iommu, tmp->bus->number,
3717                     tmp->devfn);
3718    }
3719}
3720
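/*
 * domain_remove_one_dev_info() - detach @pdev from @domain.  The
 * device_domain_info for @pdev is unlinked and its context entry (plus any
 * dependent bridge entries) torn down.  If no other device behind the same
 * IOMMU remains in the domain, that IOMMU is cleared from iommu_bmp and,
 * for ordinary DMA domains, the domain id is released on it.
 */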
3721static void domain_remove_one_dev_info(struct dmar_domain *domain,
3722                      struct pci_dev *pdev)
3723{
3724    struct device_domain_info *info;
3725    struct intel_iommu *iommu;
3726    unsigned long flags;
3727    int found = 0;
3728    struct list_head *entry, *tmp;
3729
3730    iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3731                pdev->devfn);
3732    if (!iommu)
3733        return;
3734
3735    spin_lock_irqsave(&device_domain_lock, flags);
3736    list_for_each_safe(entry, tmp, &domain->devices) {
3737        info = list_entry(entry, struct device_domain_info, link);
3738        if (info->segment == pci_domain_nr(pdev->bus) &&
3739            info->bus == pdev->bus->number &&
3740            info->devfn == pdev->devfn) {
3741            unlink_domain_info(info);
3742            spin_unlock_irqrestore(&device_domain_lock, flags);
3743
3744            iommu_disable_dev_iotlb(info);
3745            iommu_detach_dev(iommu, info->bus, info->devfn);
3746            iommu_detach_dependent_devices(iommu, pdev);
3747            free_devinfo_mem(info);
3748
3749            spin_lock_irqsave(&device_domain_lock, flags);
3750
3751            if (found)
3752                break;
3753            else
3754                continue;
3755        }
3756
3757        /* if there are no other devices under the same iommu
3758         * owned by this domain, clear this iommu in iommu_bmp,
3759         * then update the iommu count and coherency
3760         */
3761        if (iommu == device_to_iommu(info->segment, info->bus,
3762                        info->devfn))
3763            found = 1;
3764    }
3765
3766    spin_unlock_irqrestore(&device_domain_lock, flags);
3767
3768    if (found == 0) {
3769        unsigned long tmp_flags;
3770        spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3771        clear_bit(iommu->seq_id, domain->iommu_bmp);
3772        domain->iommu_count--;
3773        domain_update_iommu_cap(domain);
3774        spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3775
3776        if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3777            !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3778            spin_lock_irqsave(&iommu->lock, tmp_flags);
3779            clear_bit(domain->id, iommu->domain_ids);
3780            iommu->domains[domain->id] = NULL;
3781            spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3782        }
3783    }
3784}
3785
3786static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3787{
3788    struct device_domain_info *info;
3789    struct intel_iommu *iommu;
3790    unsigned long flags1, flags2;
3791
3792    spin_lock_irqsave(&device_domain_lock, flags1);
3793    while (!list_empty(&domain->devices)) {
3794        info = list_entry(domain->devices.next,
3795            struct device_domain_info, link);
3796        unlink_domain_info(info);
3797        spin_unlock_irqrestore(&device_domain_lock, flags1);
3798
3799        iommu_disable_dev_iotlb(info);
3800        iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3801        iommu_detach_dev(iommu, info->bus, info->devfn);
3802        iommu_detach_dependent_devices(iommu, info->dev);
3803
3804        /* clear this iommu in iommu_bmp, update iommu count
3805         * and capabilities
3806         */
3807        spin_lock_irqsave(&domain->iommu_lock, flags2);
3808        if (test_and_clear_bit(iommu->seq_id,
3809                       domain->iommu_bmp)) {
3810            domain->iommu_count--;
3811            domain_update_iommu_cap(domain);
3812        }
3813        spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3814
3815        free_devinfo_mem(info);
3816        spin_lock_irqsave(&device_domain_lock, flags1);
3817    }
3818    spin_unlock_irqrestore(&device_domain_lock, flags1);
3819}
3820
3821/* next domain id for virtual-machine domains; such ids are never set in a context entry */
3822static unsigned long vm_domid;
3823
3824static struct dmar_domain *iommu_alloc_vm_domain(void)
3825{
3826    struct dmar_domain *domain;
3827
3828    domain = alloc_domain_mem();
3829    if (!domain)
3830        return NULL;
3831
3832    domain->id = vm_domid++;
3833    domain->nid = -1;
3834    memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3835    domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3836
3837    return domain;
3838}
3839
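/*
 * md_domain_init() - second-stage init for a VM domain created through the
 * IOMMU API: set up its iova allocator and reserved ranges, derive the agaw
 * from @guest_width and allocate the top-level page directory.
 */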
3840static int md_domain_init(struct dmar_domain *domain, int guest_width)
3841{
3842    int adjust_width;
3843
3844    init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3845    spin_lock_init(&domain->iommu_lock);
3846
3847    domain_reserve_special_ranges(domain);
3848
3849    /* calculate AGAW */
3850    domain->gaw = guest_width;
3851    adjust_width = guestwidth_to_adjustwidth(guest_width);
3852    domain->agaw = width_to_agaw(adjust_width);
3853
3854    INIT_LIST_HEAD(&domain->devices);
3855
3856    domain->iommu_count = 0;
3857    domain->iommu_coherency = 0;
3858    domain->iommu_snooping = 0;
3859    domain->iommu_superpage = 0;
3860    domain->max_addr = 0;
3861    domain->nid = -1;
3862
3863    /* always allocate the top pgd */
3864    domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3865    if (!domain->pgd)
3866        return -ENOMEM;
3867    domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3868    return 0;
3869}
3870
3871static void iommu_free_vm_domain(struct dmar_domain *domain)
3872{
3873    unsigned long flags;
3874    struct dmar_drhd_unit *drhd;
3875    struct intel_iommu *iommu;
3876    unsigned long i;
3877    unsigned long ndomains;
3878
3879    for_each_drhd_unit(drhd) {
3880        if (drhd->ignored)
3881            continue;
3882        iommu = drhd->iommu;
3883
3884        ndomains = cap_ndoms(iommu->cap);
3885        for_each_set_bit(i, iommu->domain_ids, ndomains) {
3886            if (iommu->domains[i] == domain) {
3887                spin_lock_irqsave(&iommu->lock, flags);
3888                clear_bit(i, iommu->domain_ids);
3889                iommu->domains[i] = NULL;
3890                spin_unlock_irqrestore(&iommu->lock, flags);
3891                break;
3892            }
3893        }
3894    }
3895}
3896
3897static void vm_domain_exit(struct dmar_domain *domain)
3898{
3899    /* Domain 0 is reserved, so don't process it */
3900    if (!domain)
3901        return;
3902
3903    vm_domain_remove_all_dev_info(domain);
3904    /* destroy iovas */
3905    put_iova_domain(&domain->iovad);
3906
3907    /* clear ptes */
3908    dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3909
3910    /* free page tables */
3911    dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3912
3913    iommu_free_vm_domain(domain);
3914    free_domain_mem(domain);
3915}
3916
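/*
 * IOMMU API entry points.  intel_iommu_domain_init() backs a generic
 * iommu_domain with a VM dmar_domain using DEFAULT_DOMAIN_ADDRESS_WIDTH,
 * then advertises the resulting aperture in domain->geometry.
 */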
3917static int intel_iommu_domain_init(struct iommu_domain *domain)
3918{
3919    struct dmar_domain *dmar_domain;
3920
3921    dmar_domain = iommu_alloc_vm_domain();
3922    if (!dmar_domain) {
3923        printk(KERN_ERR
3924            "intel_iommu_domain_init: dmar_domain == NULL\n");
3925        return -ENOMEM;
3926    }
3927    if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3928        printk(KERN_ERR
3929            "intel_iommu_domain_init() failed\n");
3930        vm_domain_exit(dmar_domain);
3931        return -ENOMEM;
3932    }
3933    domain_update_iommu_cap(dmar_domain);
3934    domain->priv = dmar_domain;
3935
3936    domain->geometry.aperture_start = 0;
3937    domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3938    domain->geometry.force_aperture = true;
3939
3940    return 0;
3941}
3942
3943static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3944{
3945    struct dmar_domain *dmar_domain = domain->priv;
3946
3947    domain->priv = NULL;
3948    vm_domain_exit(dmar_domain);
3949}
3950
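/*
 * intel_iommu_attach_device() - ->attach_dev callback.  Any existing context
 * mapping for the device is torn down first, the IOMMU's address width is
 * checked against the domain's highest mapped address, and extra page-table
 * levels are stripped so the domain's agaw matches what this IOMMU supports
 * before the device is finally added with multi-level translation.
 */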
3951static int intel_iommu_attach_device(struct iommu_domain *domain,
3952                     struct device *dev)
3953{
3954    struct dmar_domain *dmar_domain = domain->priv;
3955    struct pci_dev *pdev = to_pci_dev(dev);
3956    struct intel_iommu *iommu;
3957    int addr_width;
3958
3959    /* normally pdev is not mapped */
3960    if (unlikely(domain_context_mapped(pdev))) {
3961        struct dmar_domain *old_domain;
3962
3963        old_domain = find_domain(pdev);
3964        if (old_domain) {
3965            if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3966                dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3967                domain_remove_one_dev_info(old_domain, pdev);
3968            else
3969                domain_remove_dev_info(old_domain);
3970        }
3971    }
3972
3973    iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3974                pdev->devfn);
3975    if (!iommu)
3976        return -ENODEV;
3977
3978    /* check if this iommu agaw is sufficient for max mapped address */
3979    addr_width = agaw_to_width(iommu->agaw);
3980    if (addr_width > cap_mgaw(iommu->cap))
3981        addr_width = cap_mgaw(iommu->cap);
3982
3983    if (dmar_domain->max_addr > (1LL << addr_width)) {
3984        printk(KERN_ERR "%s: iommu width (%d) is not "
3985               "sufficient for the mapped address (%llx)\n",
3986               __func__, addr_width, dmar_domain->max_addr);
3987        return -EFAULT;
3988    }
3989    dmar_domain->gaw = addr_width;
3990
3991    /*
3992     * Knock out extra levels of page tables if necessary
3993     */
3994    while (iommu->agaw < dmar_domain->agaw) {
3995        struct dma_pte *pte;
3996
3997        pte = dmar_domain->pgd;
3998        if (dma_pte_present(pte)) {
3999            dmar_domain->pgd = (struct dma_pte *)
4000                phys_to_virt(dma_pte_addr(pte));
4001            free_pgtable_page(pte);
4002        }
4003        dmar_domain->agaw--;
4004    }
4005
4006    return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4007}
4008
4009static void intel_iommu_detach_device(struct iommu_domain *domain,
4010                      struct device *dev)
4011{
4012    struct dmar_domain *dmar_domain = domain->priv;
4013    struct pci_dev *pdev = to_pci_dev(dev);
4014
4015    domain_remove_one_dev_info(dmar_domain, pdev);
4016}
4017
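/*
 * intel_iommu_map() - ->map callback for the generic IOMMU API.  The
 * IOMMU_READ/WRITE/CACHE flags are translated into DMA_PTE_* bits, the end
 * of the range is checked against the domain's address width, and the PTEs
 * for the page-aligned range are then written.  A caller holding an attached
 * domain would typically go through the generic wrapper, roughly:
 *
 *	iommu_map(domain, iova, paddr, SZ_4K, IOMMU_READ | IOMMU_WRITE);
 *
 * (illustrative sketch only, not taken from this file).
 */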
4018static int intel_iommu_map(struct iommu_domain *domain,
4019               unsigned long iova, phys_addr_t hpa,
4020               size_t size, int iommu_prot)
4021{
4022    struct dmar_domain *dmar_domain = domain->priv;
4023    u64 max_addr;
4024    int prot = 0;
4025    int ret;
4026
4027    if (iommu_prot & IOMMU_READ)
4028        prot |= DMA_PTE_READ;
4029    if (iommu_prot & IOMMU_WRITE)
4030        prot |= DMA_PTE_WRITE;
4031    if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4032        prot |= DMA_PTE_SNP;
4033
4034    max_addr = iova + size;
4035    if (dmar_domain->max_addr < max_addr) {
4036        u64 end;
4037
4038        /* check if minimum agaw is sufficient for mapped address */
4039        end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4040        if (end < max_addr) {
4041            printk(KERN_ERR "%s: iommu width (%d) is not "
4042                   "sufficient for the mapped address (%llx)\n",
4043                   __func__, dmar_domain->gaw, max_addr);
4044            return -EFAULT;
4045        }
4046        dmar_domain->max_addr = max_addr;
4047    }
4048    /* Round up size to next multiple of PAGE_SIZE, if it and
4049       the low bits of hpa would take us onto the next page */
4050    size = aligned_nrpages(hpa, size);
4051    ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4052                 hpa >> VTD_PAGE_SHIFT, size, prot);
4053    return ret;
4054}
4055
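/*
 * intel_iommu_unmap() - ->unmap callback.  Clears the PTEs covering the
 * requested range and reports back how much was actually unmapped
 * (PAGE_SIZE << order), which the IOMMU core uses to walk the region.
 */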
4056static size_t intel_iommu_unmap(struct iommu_domain *domain,
4057                 unsigned long iova, size_t size)
4058{
4059    struct dmar_domain *dmar_domain = domain->priv;
4060    int order;
4061
4062    order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4063                (iova + size - 1) >> VTD_PAGE_SHIFT);
4064
4065    if (dmar_domain->max_addr == iova + size)
4066        dmar_domain->max_addr = iova;
4067
4068    return PAGE_SIZE << order;
4069}
4070
4071static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4072                        unsigned long iova)
4073{
4074    struct dmar_domain *dmar_domain = domain->priv;
4075    struct dma_pte *pte;
4076    u64 phys = 0;
4077
4078    pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4079    if (pte)
4080        phys = dma_pte_addr(pte);
4081
4082    return phys;
4083}
4084
4085static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4086                      unsigned long cap)
4087{
4088    struct dmar_domain *dmar_domain = domain->priv;
4089
4090    if (cap == IOMMU_CAP_CACHE_COHERENCY)
4091        return dmar_domain->iommu_snooping;
4092    if (cap == IOMMU_CAP_INTR_REMAP)
4093        return irq_remapping_enabled;
4094
4095    return 0;
4096}
4097
4098static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
4099{
4100    pci_dev_put(*from);
4101    *from = to;
4102}
4103
4104#define REQ_ACS_FLAGS (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4105
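/*
 * intel_iommu_add_device() - place a newly discovered PCI device into an
 * iommu_group.  The group is keyed on the "DMA source": quirked aliases,
 * multifunction siblings without the required ACS isolation, and devices
 * behind conventional PCI bridges all collapse onto the same upstream
 * device, and therefore into the same group.
 */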
4106static int intel_iommu_add_device(struct device *dev)
4107{
4108    struct pci_dev *pdev = to_pci_dev(dev);
4109    struct pci_dev *bridge, *dma_pdev;
4110    struct iommu_group *group;
4111    int ret;
4112
4113    if (!device_to_iommu(pci_domain_nr(pdev->bus),
4114                 pdev->bus->number, pdev->devfn))
4115        return -ENODEV;
4116
4117    bridge = pci_find_upstream_pcie_bridge(pdev);
4118    if (bridge) {
4119        if (pci_is_pcie(bridge))
4120            dma_pdev = pci_get_domain_bus_and_slot(
4121                        pci_domain_nr(pdev->bus),
4122                        bridge->subordinate->number, 0);
4123        else
4124            dma_pdev = pci_dev_get(bridge);
4125    } else
4126        dma_pdev = pci_dev_get(pdev);
4127
4128    /* Account for quirked devices */
4129    swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4130
4131    /*
4132     * If it's a multifunction device that does not support our
4133     * required ACS flags, add to the same group as function 0.
4134     */
4135    if (dma_pdev->multifunction &&
4136        !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS))
4137        swap_pci_ref(&dma_pdev,
4138                 pci_get_slot(dma_pdev->bus,
4139                      PCI_DEVFN(PCI_SLOT(dma_pdev->devfn),
4140                      0)));
4141
4142    /*
4143     * Devices on the root bus go through the iommu. If that's not us,
4144     * find the next upstream device and test ACS up to the root bus.
4145     * Finding the next device may require skipping virtual buses.
4146     */
4147    while (!pci_is_root_bus(dma_pdev->bus)) {
4148        struct pci_bus *bus = dma_pdev->bus;
4149
4150        while (!bus->self) {
4151            if (!pci_is_root_bus(bus))
4152                bus = bus->parent;
4153            else
4154                goto root_bus;
4155        }
4156
4157        if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4158            break;
4159
4160        swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4161    }
4162
4163root_bus:
4164    group = iommu_group_get(&dma_pdev->dev);
4165    pci_dev_put(dma_pdev);
4166    if (!group) {
4167        group = iommu_group_alloc();
4168        if (IS_ERR(group))
4169            return PTR_ERR(group);
4170    }
4171
4172    ret = iommu_group_add_device(group, dev);
4173
4174    iommu_group_put(group);
4175    return ret;
4176}
4177
4178static void intel_iommu_remove_device(struct device *dev)
4179{
4180    iommu_group_remove_device(dev);
4181}
4182
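/*
 * The ops table exported to the IOMMU core; it is attached to the PCI bus
 * in intel_iommu_init() via bus_set_iommu(&pci_bus_type, &intel_iommu_ops).
 */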
4183static struct iommu_ops intel_iommu_ops = {
4184    .domain_init = intel_iommu_domain_init,
4185    .domain_destroy = intel_iommu_domain_destroy,
4186    .attach_dev = intel_iommu_attach_device,
4187    .detach_dev = intel_iommu_detach_device,
4188    .map = intel_iommu_map,
4189    .unmap = intel_iommu_unmap,
4190    .iova_to_phys = intel_iommu_iova_to_phys,
4191    .domain_has_cap = intel_iommu_domain_has_cap,
4192    .add_device = intel_iommu_add_device,
4193    .remove_device = intel_iommu_remove_device,
4194    .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
4195};
4196
4197static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4198{
4199    /*
4200     * Mobile 4 Series Chipset neglects to set RWBF capability,
4201     * but needs it:
4202     */
4203    printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4204    rwbf_quirk = 1;
4205
4206    /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4207    if (dev->revision == 0x07) {
4208        printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4209        dmar_map_gfx = 0;
4210    }
4211}
4212
4213DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4214
4215#define GGC 0x52
4216#define GGC_MEMORY_SIZE_MASK (0xf << 8)
4217#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4218#define GGC_MEMORY_SIZE_1M (0x1 << 8)
4219#define GGC_MEMORY_SIZE_2M (0x3 << 8)
4220#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4221#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4222#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4223#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4224
4225static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4226{
4227    unsigned short ggc;
4228
4229    if (pci_read_config_word(dev, GGC, &ggc))
4230        return;
4231
4232    if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4233        printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4234        dmar_map_gfx = 0;
4235    } else if (dmar_map_gfx) {
4236        /* we have to ensure the gfx device is idle before we flush */
4237        printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4238        intel_iommu_strict = 1;
4239    }
4240}
4241DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4242DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4243DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4244DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4245
4246/* On Tylersburg chipsets, some BIOSes have been known to enable the
4247   ISOCH DMAR unit for the Azalia sound device, but not give it any
4248   TLB entries, which causes it to deadlock. Check for that. We do
4249   this in a function called from init_dmars(), instead of in a PCI
4250   quirk, because we don't want to print the obnoxious "BIOS broken"
4251   message if VT-d is actually disabled.
4252*/
4253static void __init check_tylersburg_isoch(void)
4254{
4255    struct pci_dev *pdev;
4256    uint32_t vtisochctrl;
4257
4258    /* If there's no Azalia in the system anyway, forget it. */
4259    pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4260    if (!pdev)
4261        return;
4262    pci_dev_put(pdev);
4263
4264    /* System Management Registers. Might be hidden, in which case
4265       we can't do the sanity check. But that's OK, because the
4266       known-broken BIOSes _don't_ actually hide it, so far. */
4267    pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4268    if (!pdev)
4269        return;
4270
4271    if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4272        pci_dev_put(pdev);
4273        return;
4274    }
4275
4276    pci_dev_put(pdev);
4277
4278    /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4279    if (vtisochctrl & 1)
4280        return;
4281
4282    /* Drop all bits other than the number of TLB entries */
4283    vtisochctrl &= 0x1c;
4284
4285    /* If we have the recommended number of TLB entries (16), fine. */
4286    if (vtisochctrl == 0x10)
4287        return;
4288
4289    /* Zero TLB entries? You get to ride the short bus to school. */
4290    if (!vtisochctrl) {
4291        WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4292             "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4293             dmi_get_system_info(DMI_BIOS_VENDOR),
4294             dmi_get_system_info(DMI_BIOS_VERSION),
4295             dmi_get_system_info(DMI_PRODUCT_VERSION));
4296        iommu_identity_mapping |= IDENTMAP_AZALIA;
4297        return;
4298    }
4299
4300    printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4301           vtisochctrl);
4302}
4303
