diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 8169e8b7a4dc..12915511be61 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -305,48 +305,13 @@ ENTRY(startup_64)
/* Set up the stack */
leaq boot_stack_end(%rbx), %rsp
- /*
- * paging_prepare() and cleanup_trampoline() below can have GOT
- * references. Adjust the table with address we are running at.
- *
- * Zero RAX for adjust_got: the GOT was not adjusted before;
- * there's no adjustment to undo.
- */
- xorq %rax, %rax
-
- /*
- * Calculate the address the binary is loaded at and use it as
- * a GOT adjustment.
- */
- call 1f
-1: popq %rdi
- subq $1b, %rdi
-
- call adjust_got
-
/*
* At this point we are in long mode with 4-level paging enabled,
- * but we might want to enable 5-level paging or vice versa.
- *
- * The problem is that we cannot do it directly. Setting or clearing
- * CR4.LA57 in long mode would trigger #GP. So we need to switch off
- * long mode and paging first.
- *
- * We also need a trampoline in lower memory to switch over from
- * 4- to 5-level paging for cases when the bootloader puts the kernel
- * above 4G, but didn't enable 5-level paging for us.
- *
- * The same trampoline can be used to switch from 5- to 4-level paging
- * mode, like when starting 4-level paging kernel via kexec() when
- * original kernel worked in 5-level paging mode.
- *
- * For the trampoline, we need the top page table to reside in lower
- * memory as we don't have a way to load 64-bit values into CR3 in
- * 32-bit mode.
+ * but we want to enable 5-level paging.
*
- * We go though the trampoline even if we don't have to: if we're
- * already in a desired paging mode. This way the trampoline code gets
- * tested on every boot.
+ * The problem is that we cannot do it directly. Setting LA57 in
+ * long mode would trigger #GP. So we need to switch off long mode
+ * first.
*/
/* Make sure we have GDT with 32-bit code segment */
@@ -371,32 +336,40 @@ ENTRY(startup_64)
/* Save the trampoline address in RCX */
movq %rax, %rcx
+ /* Check if we need to enable 5-level paging */
+ cmpq $0, %rdx
+ jz lvl5
+
+ /* Clear additional page table */
+ leaq lvl5_pgtable(%rbx), %rdi
+ xorq %rax, %rax
+ movq $(PAGE_SIZE/8), %rcx
+ rep stosq
+
/*
- * Load the address of trampoline_return() into RDI.
- * It will be used by the trampoline to return to the main code.
+ * Setup current CR3 as the first and only entry in a new top level
+ * page table.
*/
- leaq trampoline_return(%rip), %rdi
+ movq %cr3, %rdi
+ leaq 0x7 (%rdi), %rax
+ movq %rax, lvl5_pgtable(%rbx)
/* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
pushq $__KERNEL32_CS
- leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax
+ leaq compatible_mode(%rip), %rax
pushq %rax
lretq
-trampoline_return:
+lvl5:
/* Restore the stack, the 32-bit trampoline uses its own stack */
leaq boot_stack_end(%rbx), %rsp
/*
* cleanup_trampoline() would restore trampoline memory.
*
- * RDI is address of the page table to use instead of page table
- * in trampoline memory (if required).
- *
* RSI holds real mode data and needs to be preserved across
* this function call.
*/
pushq %rsi
- leaq top_pgtable(%rbx), %rdi
call cleanup_trampoline
popq %rsi
@@ -404,21 +377,6 @@ trampoline_return:
pushq $0
popfq
- /*
- * Previously we've adjusted the GOT with address the binary was
- * loaded at. Now we need to re-adjust for relocation address.
- *
- * Calculate the address the binary is loaded at, so that we can
- * undo the previous GOT adjustment.
- */
- call 1f
-1: popq %rax
- subq $1b, %rax
-
- /* The new adjustment is the relocation address */
- movq %rbx, %rdi
- call adjust_got
-
/*
* Copy the compressed kernel to the end of our buffer
* where decompression in place becomes safe.
@@ -519,6 +477,19 @@ relocated:
shrq $3, %rcx
rep stosq
+/*
+ * Adjust our own GOT
+ */
+ leaq _got(%rip), %rdx
+ leaq _egot(%rip), %rcx
+1:
+ cmpq %rcx, %rdx
+ jae 2f
+ addq %rbx, (%rdx)
+ addq $8, %rdx
+ jmp 1b
+2:
+
/*
* Do the extraction, and jump to the new kernel..
*/
@@ -537,36 +508,9 @@ relocated:
*/
jmp *%rax
-/*
- * Adjust the global offset table
- *
- * RAX is the previous adjustment of the table to undo (use 0 if it's the
- * first time we touch GOT).
- * RDI is the new adjustment to apply.
- */
-adjust_got:
- /* Walk through the GOT adding the address to the entries */
- leaq _got(%rip), %rdx
- leaq _egot(%rip), %rcx
-1:
- cmpq %rcx, %rdx
- jae 2f
- subq %rax, (%rdx) /* Undo previous adjustment */
- addq %rdi, (%rdx) /* Apply the new adjustment */
- addq $8, %rdx
- jmp 1b
-2:
- ret
-
.code32
-/*
- * This is the 32-bit trampoline that will be copied over to low memory.
- *
- * RDI contains the return address (might be above 4G).
- * ECX contains the base address of the trampoline memory.
- * Non zero RDX on return means we need to enable 5-level paging.
- */
ENTRY(trampoline_32bit_src)
+compatible_mode:
/* Set up data and stack segments */
movl $__KERNEL_DS, %eax
movl %eax, %ds
@@ -580,61 +524,33 @@ ENTRY(trampoline_32bit_src)
btrl $X86_CR0_PG_BIT, %eax
movl %eax, %cr0
- /* Check what paging mode we want to be in after the trampoline */
- cmpl $0, %edx
- jz 1f
+ /* Point CR3 to 5-level paging */
+ leal lvl5_pgtable(%ebx), %eax
+ movl %eax, %cr3
- /* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */
+ /* Enable PAE and LA57 mode */
movl %cr4, %eax
- testl $X86_CR4_LA57, %eax
- jnz 3f
- jmp 2f
-1:
- /* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */
- movl %cr4, %eax
- testl $X86_CR4_LA57, %eax
- jz 3f
-2:
- /* Point CR3 to the trampoline's new top level page table */
- leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax
- movl %eax, %cr3
-3:
- /* Enable PAE and LA57 (if required) paging modes */
- movl $X86_CR4_PAE, %eax
- cmpl $0, %edx
- jz 1f
- orl $X86_CR4_LA57, %eax
-1:
+ orl $(X86_CR4_PAE | X86_CR4_LA57), %eax
movl %eax, %cr4
- /* Calculate address of paging_enabled() once we are executing in the trampoline */
- leal paging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax
+ /* Calculate address we are running at */
+ call 1f
+1: popl %edi
+ subl $1b, %edi
- /* Prepare the stack for far return to Long Mode */
+ /* Prepare stack for far return to Long Mode */
pushl $__KERNEL_CS
- pushl %eax
+ leal lvl5(%edi), %eax
+ push %eax
- /* Enable paging again */
+ /* Enable paging back */
movl $(X86_CR0_PG | X86_CR0_PE), %eax
movl %eax, %cr0
lret
- .code64
-paging_enabled:
- /* Return from the trampoline */
- jmp *%rdi
-
- /*
- * The trampoline code has a size limit.
- * Make sure we fail to compile if the trampoline code grows
- * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes.
- */
- .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE
-
- .code32
no_longmode:
- /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */
+ /* This isn't an x86-64 CPU so hang */
1:
hlt
jmp 1b
@@ -695,10 +611,5 @@ boot_stack_end:
.balign 4096
pgtable:
.fill BOOT_PGT_SIZE, 1, 0
-
-/*
- * The page table is going to be used instead of page table in the trampoline
- * memory.
- */
-top_pgtable:
+lvl5_pgtable:
.fill PAGE_SIZE, 1, 0
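
(An aside on the "Adjust our own GOT" hunk above: the open-coded loop walks the global offset table between the _got and _egot link-time symbols and adds the relocation address held in %rbx to every 8-byte entry. A rough C equivalent of that walk is sketched below; the names adjust_got_entries, got_start, got_end and reloc_addr are illustrative only, not kernel symbols, and the real loop necessarily runs in assembly before any C code is reachable.)

    #include <stdint.h>

    /*
     * Illustrative C rendering of the GOT fixup loop from the
     * "Adjust our own GOT" hunk.  Each GOT entry holds a link-time
     * address; adding the runtime relocation address rebases it to
     * where the binary actually executes.
     */
    static void adjust_got_entries(uint64_t *got_start, uint64_t *got_end,
                                   uint64_t reloc_addr)
    {
            uint64_t *entry;

            for (entry = got_start; entry < got_end; entry++)
                    *entry += reloc_addr;
    }
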
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index a362fa0b849c..32af1cbcd903 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -22,6 +22,14 @@ struct paging_config {
/* Buffer to preserve trampoline memory */
static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
+/*
+ * The page table is going to be used instead of page table in the trampoline
+ * memory.
+ *
+ * It must not be in BSS as BSS is cleared after cleanup_trampoline().
+ */
+static char top_pgtable[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data);
+
/*
* Trampoline address will be printed by extract_kernel() for debugging
* purposes.
@@ -126,7 +134,7 @@ struct paging_config paging_prepare(void)
return paging_config;
}
-void cleanup_trampoline(void *pgtable)
+void cleanup_trampoline(void)
{
void *trampoline_pgtable;
@@ -137,8 +145,8 @@ void cleanup_trampoline(void *pgtable)
* if it's there.
*/
if ((void *)__native_read_cr3() == trampoline_pgtable) {
- memcpy(pgtable, trampoline_pgtable, PAGE_SIZE);
- native_write_cr3((unsigned long)pgtable);
+ memcpy(top_pgtable, trampoline_pgtable, PAGE_SIZE);
+ native_write_cr3((unsigned long)top_pgtable);
}
/* Restore trampoline memory */
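
(One more aside, on the lvl5 setup in the head_64.S hunk: `leaq 0x7 (%rdi), %rax` takes the current CR3 value, i.e. the page-aligned physical address of the PML4, and adds 0x7, which for an aligned address is the same as OR-ing in the Present, Read/Write and User/Supervisor bits. A C sketch of the resulting PML5 entry, with an illustrative helper name:)

    #include <stdint.h>

    /* x86 page-table entry flag bits, as in the architecture manuals. */
    #define _PAGE_PRESENT 0x1ULL   /* entry is valid */
    #define _PAGE_RW      0x2ULL   /* writable */
    #define _PAGE_USER    0x4ULL   /* user-accessible */

    /*
     * What `leaq 0x7 (%rdi), %rax` computes: the PML4 address from
     * CR3 plus the 0x7 flag bits, forming the sole entry of the new
     * 5-level top table.  The helper name is illustrative, not a
     * kernel symbol.
     */
    static inline uint64_t make_pml5_entry(uint64_t pml4_phys)
    {
            return pml4_phys | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER;
    }
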