Annotation of /trunk/kernel26-magellan/patches-2.6.16-r12/0119-2.6.16.12-4GB-pte_clear.patch
Parent Directory | Revision Log
Revision 72 -
(hide annotations)
(download)
Mon Jun 5 09:25:38 2006 UTC (18 years, 3 months ago) by niro
File size: 5415 byte(s)
ver bump to 2.6.16-r12: - updated to linux-2.6.16.19 - updated to ck11
1 | niro | 72 | From: Zachary Amsden <zach@vmware.com> |
2 | Date: Thu, 27 Apr 2006 20:01:39 +0000 (+0000) | ||
3 | Subject: [PATCH] x86/PAE: Fix pte_clear for the >4GB RAM case | ||
4 | X-Git-Url: http://www.kernel.org/git/?p=linux/kernel/git/stable/linux-2.6.16.y.git;a=commitdiff;h=b00f098c1467ee11260b5178d08ed793c720fc0c | ||
5 | |||
6 | [PATCH] x86/PAE: Fix pte_clear for the >4GB RAM case | ||
7 | |||
8 | Proposed fix for ptep_get_and_clear_full PAE bug. Pte_clear had the same bug, | ||
9 | so use the same fix for both. Turns out pmd_clear had it as well, but pgds | ||
10 | are not affected. | ||
11 | |||
12 | The problem is rather intricate. Page table entries in PAE mode are 64-bits | ||
13 | wide, but the only atomic 8-byte write operation available in 32-bit mode is | ||
14 | cmpxchg8b, which is expensive (at least on P4), and thus avoided. But it can | ||
15 | happen that the processor may prefetch entries into the TLB in the middle of an | ||
16 | operation which clears a page table entry. So one must always clear the P-bit | ||
17 | in the low word of the page table entry first when clearing it. | ||
18 | |||
19 | Since the sequence *ptep = __pte(0) leaves the order of the write dependent on | ||
20 | the compiler, it must be coded explicitly as a clear of the low word followed | ||
21 | by a clear of the high word. Further, there must be a write memory barrier | ||
22 | here to enforce proper ordering by the compiler (and, in the future, by the | ||
23 | processor as well). | ||
24 | |||
25 | On > 4GB memory machines, the implementation of pte_clear for PAE was clearly | ||
26 | deficient, as it could leave virtual mappings of physical memory above 4GB | ||
27 | aliased to memory below 4GB in the TLB. The implementation of | ||
28 | ptep_get_and_clear_full has a similar bug, although not nearly as likely to | ||
29 | occur, since the mappings being cleared are in the process of being destroyed, | ||
30 | and should never be dereferenced again. | ||
31 | |||
32 | But, as luck would have it, it is possible to trigger bugs even without ever | ||
33 | dereferencing these bogus TLB mappings, even if the clear is followed fairly | ||
34 | soon after with a TLB flush or invalidation. The problem is that memory above | ||
35 | 4GB may now be aliased into the first 4GB of memory, and in fact, may hit a | ||
36 | region of memory with non-memory semantics. These regions include AGP and PCI | ||
37 | space. As such, these memory regions are not cached by the processor. This | ||
38 | introduces the bug. | ||
39 | |||
40 | The processor can speculate memory operations, including memory writes, as long | ||
41 | as they are committed with the proper ordering. Speculating a memory write to | ||
42 | a linear address that has a bogus TLB mapping is possible. Normally, the | ||
43 | speculation is harmless. But for cached memory, it does leave the falsely | ||
44 | speculated cacheline unmodified, but in a dirty state. This cache line will be | ||
45 | eventually written back. If this cacheline happens to intersect a region of | ||
46 | memory that is not protected by the cache coherency protocol, it can corrupt | ||
47 | data in I/O memory, which is generally a very bad thing to do, and can cause | ||
48 | total system failure or just plain undefined behavior. | ||
49 | |||
50 | These bugs are extremely unlikely, but the severity is of such magnitude, and | ||
51 | the fix so simple that I think fixing them immediately is justified. Also, | ||
52 | they are nearly impossible to debug. | ||
53 | |||
54 | Signed-off-by: Zachary Amsden <zach@vmware.com> | ||
55 | Signed-off-by: Linus Torvalds <torvalds@osdl.org> | ||
56 | Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de> | ||
57 | --- | ||
58 | |||
59 | --- a/include/asm-i386/pgtable-2level.h | ||
60 | +++ b/include/asm-i386/pgtable-2level.h | ||
61 | @@ -18,6 +18,9 @@ | ||
62 | #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval) | ||
63 | #define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) | ||
64 | |||
65 | +#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) | ||
66 | +#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) | ||
67 | + | ||
68 | #define ptep_get_and_clear(mm,addr,xp) __pte(xchg(&(xp)->pte_low, 0)) | ||
69 | #define pte_same(a, b) ((a).pte_low == (b).pte_low) | ||
70 | #define pte_page(x) pfn_to_page(pte_pfn(x)) | ||
71 | --- a/include/asm-i386/pgtable-3level.h | ||
72 | +++ b/include/asm-i386/pgtable-3level.h | ||
73 | @@ -85,6 +85,26 @@ static inline void pud_clear (pud_t * pu | ||
74 | #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \ | ||
75 | pmd_index(address)) | ||
76 | |||
77 | +/* | ||
78 | + * For PTEs and PDEs, we must clear the P-bit first when clearing a page table | ||
79 | + * entry, so clear the bottom half first and enforce ordering with a compiler | ||
80 | + * barrier. | ||
81 | + */ | ||
82 | +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | ||
83 | +{ | ||
84 | + ptep->pte_low = 0; | ||
85 | + smp_wmb(); | ||
86 | + ptep->pte_high = 0; | ||
87 | +} | ||
88 | + | ||
89 | +static inline void pmd_clear(pmd_t *pmd) | ||
90 | +{ | ||
91 | + u32 *tmp = (u32 *)pmd; | ||
92 | + *tmp = 0; | ||
93 | + smp_wmb(); | ||
94 | + *(tmp + 1) = 0; | ||
95 | +} | ||
96 | + | ||
97 | static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | ||
98 | { | ||
99 | pte_t res; | ||
100 | --- a/include/asm-i386/pgtable.h | ||
101 | +++ b/include/asm-i386/pgtable.h | ||
102 | @@ -204,12 +204,10 @@ extern unsigned long long __PAGE_KERNEL, | ||
103 | extern unsigned long pg0[]; | ||
104 | |||
105 | #define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) | ||
106 | -#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) | ||
107 | |||
108 | /* To avoid harmful races, pmd_none(x) should check only the lower when PAE */ | ||
109 | #define pmd_none(x) (!(unsigned long)pmd_val(x)) | ||
110 | #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) | ||
111 | -#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) | ||
112 | #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) | ||
113 | |||
114 | |||
115 | @@ -269,7 +267,7 @@ static inline pte_t ptep_get_and_clear_f | ||
116 | pte_t pte; | ||
117 | if (full) { | ||
118 | pte = *ptep; | ||
119 | - *ptep = __pte(0); | ||
120 | + pte_clear(mm, addr, ptep); | ||
121 | } else { | ||
122 | pte = ptep_get_and_clear(mm, addr, ptep); | ||
123 | } |