Welcome to mirror list, hosted at ThFree Co, Russian Federation.

rdimon-aem.S « cpu-init « arm « libgloss - cygwin.com/git/newlib-cygwin.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 19814c6dfbeac2fb6c4ec65423cb7a7c05b47ccb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
/* Copyright (c) 2005-2013 ARM Ltd.  All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:
 1. Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.
 3. The name of the company may not be used to endorse or promote
    products derived from this software without specific prior written
    permission.

 THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* This file gives a basic initialisation of a Cortex-A series core.  It is
   the bare minimum required to get Cortex-A core running with a semihosting
   interface.

   It sets up a basic 1:1 physical address to virtual address mapping;
   turns the MMU on; enables branch prediction; activates any integrated
   caches; enables the Advanced SIMD and VFP co-processors; and installs
   basic exception handlers.

   It does not handle peripherals, and assumes all memory is Normal.

   It does not change processor state from the startup privilege and security
   level.

   This has only been tested to work in ARM state.

   By default it assumes exception vectors are located from address 0.
   However, if this is not true they can be moved by defining the
   _rdimon_vector_base symbol.  For example if you have HIVECS enabled you
   may pass --defsym _rdimon_vector_base=0xffff0000 on the linker command
   line.  */

   /* __ARM_ARCH_PROFILE is defined from GCC 4.8 onwards, however __ARM_ARCH_7A__
	has been defined since 4.2 onwards, which is when v7-a support was added
	and hence 'A' profile support was added in the compiler.  Allow for this
	file to be built with older compilers.  */
#if defined(__ARM_ARCH_7A__) || (__ARM_ARCH_PROFILE == 'A')
    .syntax	unified			@ Unified ARM/Thumb assembler syntax
    .arch	armv7-a			@ Target the ARMv7-A architecture
    .arm				@ Assemble the code below in ARM state

    @ CPU Initialisation
    .globl	_rdimon_hw_init_hook
    .type	_rdimon_hw_init_hook, %function
_rdimon_hw_init_hook:
    @ Hardware init hook called from the CRT startup code with lr holding
    @ the return address (saved in r10).  It: parks secondary CPUs, sets
    @ the data endianness, resets the caches, programs a flat 1:1
    @ section-mapped translation and turns the MMU on, copies the exception
    @ vector code to _rdimon_vector_base, enables VFP/Advanced SIMD access
    @ where present, and finally re-enables the caches before returning.
    @ Clobbers r0-r10 plus the CP15 state it configures.

    @ Only run the code on CPU 0 - otherwise spin
    mrc         15, 0, r4, cr0, cr0, 5  @ Read MPIDR
    ands        r4, r4, #15             @ Extract CPU id (Aff0); Z set on CPU 0
spin:
    bne spin                            @ Park all secondary CPUs here forever

    mov         r10, lr			@ Save LR for final return

#ifdef __ARMEB__
    @ Setup for Big Endian
    setend      be
    mrc         15, 0, r4, cr1, cr0, 0  @ Read SCTLR
    orr         r4, r4, #(1<<25)        @ Switch to Big Endian (Set SCTLR.EE)
    mcr         15, 0, r4, cr1, cr0, 0  @ Write SCTLR
#else
    @ Setup for Little Endian
    setend      le
    mrc         15, 0, r4, cr1, cr0, 0  @ Read SCTLR
    bic         r4, r4, #(1<<25)        @ Switch to LE (unset SCTLR.EE)
    mcr         15, 0, r4, cr1, cr0, 0  @ Write SCTLR
#endif

    bl          is_a15_a7               @ Sets Z if core is Cortex-A15 or A7

    @ For Cortex-A15 and Cortex-A7 only:
    @ Write zero into the ACTLR to turn everything on.
    itt		eq
    moveq       r4, #0
    mcreq       15, 0, r4, c1, c0, 1    @ Write ACTLR
    isb                                 @ (executes unconditionally)

    @ For Cortex-A15 and Cortex-A7 only:
    @ Set ACTLR:SMP bit before enabling the caches and MMU,
    @ or performing any cache and TLB maintenance operations.
    ittt	eq
    mrceq       15, 0, r4, c1, c0, 1    @ Read ACTLR
    orreq       r4, r4, #(1<<6)         @ Enable ACTLR:SMP
    mcreq       15, 0, r4, c1, c0, 1    @ Write ACTLR
    isb

    @ Setup for exceptions being taken to Thumb/ARM state
    mrc         15, 0, r4, cr1, cr0, 0	@ Read SCTLR
#if defined(__thumb__)
    orr         r4, r4, #(1 << 30)	@ Enable SCTLR.TE
#else
    bic         r4, r4, #(1 << 30)      @ Disable SCTLR.TE
#endif
    mcr         15, 0, r4, cr1, cr0, 0  @ Write SCTLR

    bl          __reset_caches          @ Clean/invalidate caches (uses ip)

    mrc         15, 0, r4, cr1, cr0, 0  @ Read SCTLR
    orr         r4, r4, #(1<<22)        @ Enable unaligned mode
    bic         r4, r4, #2              @ Disable alignment faults
    bic         r4, r4, #1              @ Disable MMU
    mcr         15, 0, r4, cr1, cr0, 0  @ Write SCTLR

    mov         r4, #0
    mcr         15, 0, r4, cr8, cr7, 0  @ Write TLBIALL - Invalidate unified
                                        @ TLB
    @ Setup MMU Primary table P=V mapping.
    mvn         r4, #0                  @ All 16 domains: manager access
    mcr         15, 0, r4, cr3, cr0, 0  @ Write DACR

    mov         r4, #0                  @ Always use TTBR0, no LPAE
    mcr         15, 0, r4, cr2, cr0, 2  @ Write TTBCR
    adr         r4, page_table_addr	@ Load the page-table base address
    ldr         r4, [r4]
    add         r4, r4, #1		@ Page tables inner cacheable

    mcr         15, 0, r4, cr2, cr0, 0  @ Write TTBR0

    mov         r0, #34 @ 0x22          @ TR0 and TR1 - normal memory
    orr         r0, r0, #(1 << 19)      @ Shareable
    mcr         15, 0, r0, cr10, cr2, 0 @ Write PRRR
    movw        r0, #0x33
    movt        r0, #0x33
    mcr         15, 0, r0, cr10, cr2, 1 @ Write NMRR
    mrc         15, 0, r0, cr1, cr0, 0  @ Read SCTLR
    bic         r0, r0, #(1 << 28)      @ Clear TRE bit
    mcr         15, 0, r0, cr1, cr0, 0  @ Write SCTLR

    @ Now install the vector code - we move the Vector code from where it is
    @ in the image to be based at _rdimon_vector_base.  We have to do this copy
    @ as the code is all PC-relative.  We actually cheat and do a BX <reg> so
    @ that we are at a known address relatively quickly and have to move as
    @ little code as possible.
    mov         r7, #(VectorCode_Limit - VectorCode) @ Byte count to copy
    adr         r5, VectorCode          @ Source: vector code in the image
    adr         r6, vector_base_addr	@ Load the base for vectors
    ldr         r6, [r6]

copy_loop:                              @ Do the copy
    ldr         r4, [r5], #4
    str         r4, [r6], #4
    subs        r7, r7, #4
    bne         copy_loop

    mrc         15, 0, r4, cr1, cr0, 0  @ Read SCTLR
    bic         r4, r4, #0x1000         @ Disable I Cache
    bic         r4, r4, #4              @ Disable D Cache
    orr         r4, r4, #1              @ Enable MMU
    bic         r4, r4, #(1 << 28)      @ Clear TRE bit
    mcr         15, 0, r4, cr1, cr0, 0  @ Write SCTLR
    mrc         15, 0, r4, cr1, cr0, 2  @ Read CPACR
    orr         r4, r4, #0x00f00000     @ Turn on VFP Co-procs (cp10/cp11)
    bic         r4, r4, #0x80000000     @ Clear ASEDIS bit
    mcr         15, 0, r4, cr1, cr0, 2  @ Write CPACR
    isb
    mov         r4, #0
    mcr         15, 0, r4, cr7, cr5, 4  @ Flush prefetch buffer
    mrc         15, 0, r4, cr1, cr0, 2  @ Read CPACR back to see what stuck
    ubfx        r4, r4, #20, #4		@ Extract cp10/cp11 access bits [23:20]
    cmp         r4, #0xf		@ If not all set then the CPU does not
    itt		eq			@ have FP or Advanced SIMD.
    moveq       r4, #0x40000000		@ Enable FP and Advanced SIMD
    mcreq       10, 7, r4, cr8, cr0, 0  @ vmsr  fpexc, r4
skip_vfp_enable:
    bl          __enable_caches         @ Turn caches on
    bx		r10                     @ Return to CRT startup routine

    @ This enables us to be more precise about which caches we want.
    @ Both hooks return non-zero in r0, meaning "enable this cache";
    @ they are queried by __reset_caches and __enable_caches below.
init_cpu_client_enable_dcache:
init_cpu_client_enable_icache:
    mov         r0, #1                  @ Default: enable the cache
    bx          lr

vector_base_addr:
    .word       _rdimon_vector_base    @ Destination address for vector copy
    .weak       _rdimon_vector_base    @ Defaults to 0; override with
                                       @ --defsym _rdimon_vector_base=...
page_table_addr:
    .word       page_tables            @ Level-1 translation table base

    @ Vector code - must be PIC and in ARM state.
    @ Everything from VectorCode to VectorCode_Limit is copied to
    @ _rdimon_vector_base at startup, so it must be position-independent.
    @ Each stub points the banked sp at the small stack copied along with
    @ the code, saves r0-r12 and lr, loads its vector number into r4 and
    @ joins vector_common, which jumps (via an absolute address stored in
    @ the copied block) to the non-copied handler vector_common_2.
VectorCode:
    b           vector_reset
    b           vector_undef
    b           vector_swi
    b           vector_prefetch
    b           vector_dataabt
    b           vector_reserved
    b           vector_irq
    b           vector_fiq

vector_reset:
    adr         sp, vector_sp_base
    push        {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
    mov         r4, #0                  @ Vector number: reset
    b           vector_common
vector_undef:
    adr         sp, vector_sp_base
    push        {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
    mov         r4, #1                  @ Vector number: undefined instruction
    b           vector_common
vector_swi:
    adr         sp, vector_sp_base
    push        {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
    mov         r4, #2                  @ Vector number: supervisor call
    b           vector_common
vector_prefetch:
    adr         sp, vector_sp_base
    push        {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
    mov         r4, #3                  @ Vector number: prefetch abort
    b           vector_common
vector_dataabt:
    adr         sp, vector_sp_base
    push        {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
    mov         r4, #4                  @ Vector number: data abort
    b           vector_common
vector_reserved:
    adr         sp, vector_sp_base
    push        {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
    mov         r4, #5                  @ Vector number: reserved
    b           vector_common
vector_irq:
    adr         sp, vector_sp_base
    push        {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
    mov         r4, #6                  @ Vector number: IRQ
    b           vector_common
vector_fiq:
    adr         sp, vector_sp_base
    push        {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
    mov         r4, #7                  @ Vector number: FIQ
    b           vector_common
vector_common:
    adr         r1, vector_common_adr   @ Find where we're going to
    ldr         r1, [r1]
    bx          r1                      @ And branch there
vector_common_adr:
   .word        vector_common_2         @ Common handling code

                                        @ Vector stack
   .p2align       3                       @ Align to 8 byte boundary to
					@ keep ABI compatibility
   .fill        32, 4, 0                @ 32-entry stack is enough for vector
					@ handlers.
vector_sp_base:
VectorCode_Limit:
    @ End of PIC code for vectors

    @ Common Handling of vectors
    .type	vector_common_2, %function
vector_common_2:
    @ Entered from the relocated vector stubs with r4 = vector number and
    @ sp pointing at the copied stack, r0-r12 and lr already pushed.
    @ Prints "Hit vector: <name>" and a dump of APSR/SPSR/r0-r12/lr over
    @ semihosting, then reports a stopped exception to the debugger.
    mrs         r1, APSR
    mrs         r2, SPSR
    push        {r1, r2}                @ Save PSRs

    @ Output the vector we have caught
    bl          out_nl
    adr         r0, which_vector
    bl          out_string
    adr         r0, vector_names
    mov         r1, #11                 @ Each vector name is 11 bytes (asciz)
    mla         r0, r4, r1, r0          @ r0 = &vector_names[r4 * 11]
    bl          out_string
    bl          out_nl

    @ Dump the registers
    adrl        r6, register_names
    mov         r7, #0                  @ r7 = index into saved words on stack
dump_r_loop:
    mov         r0, r6
    bl          out_string
    add         r6, r6, #6              @ Each register name is 6 bytes (asciz)
    ldr         r0, [sp, r7, lsl #2]    @ Fetch saved APSR/SPSR/r0-r12/lr
    bl          out_word
    bl          out_nl
    add         r7, r7, #1
    cmp         r7, #16                 @ 2 PSRs + 14 registers
    blt         dump_r_loop
    adr         r0, end
    bl          out_string

    @ And exit
    mov         r0, #24                 @ angel_SWIreason_ReportException
    orr         r1, r4, #0x20000        @ ADP_Stopped + vector number
    svc         0x00123456              @ Semihosting call - does not return

    @ Output the string in r0 (NUL-terminated) via semihosting SYS_WRITE0.
out_string:
    push        {lr}
    mov         r1, r0                  @ r1 -> string for the host
    mov         r0, #4                  @ Operation: SYS_WRITE0
    svc         0x00123456
    pop         {pc}

    @ Output a New-line
out_nl:
    mov r0, #10                         @ ASCII newline
    @ Fallthrough

    @ Output the character in r0 via semihosting SYS_WRITEC.
out_char:
    push        {lr}
    strb        r0, [sp, #-4]!          @ Park the character in a stack slot
    mov         r0, #3                  @ Operation: SYS_WRITEC
    mov         r1, sp                  @ r1 -> the character
    svc         0x00123456
    add         sp, sp, #4              @ Drop the character slot
    pop         {pc}

    @ Output the value of r0 as a hex-word (8 digits, most-significant
    @ nibble first).  Clobbers r0, r1, flags; r4-r6 are preserved.
out_word:
    push        {r4, r5, r6, lr}
    mov         r4, r0                  @ r4 = value being printed
    mov         r5, #28                 @ r5 = shift for the current nibble
    adr         r6, hexchars
word_loop:
    lsr         r0, r4, r5
    and         r0, r0, #15             @ Isolate one nibble
    ldrb        r0, [r6, r0]            @ Translate nibble to ASCII digit
    bl          out_char
    subs        r5, r5, #4              @ Move to the next nibble down
    bpl         word_loop
    pop         {r4, r5, r6, pc}

hexchars:
    .ascii	"0123456789abcdef"

which_vector:
    .asciz	"Hit vector:"
end:
    .asciz	"End.\n"

    @ Vector names: 11 bytes each (10 characters + NUL), indexed by the
    @ vector number in r4 - see the mla in vector_common_2.
vector_names:
    .asciz	"reset     "
    .asciz	"undef     "
    .asciz	"swi       "
    .asciz	"prefetch  "
    .asciz	"data abort"
    .asciz	"reserved  "
    .asciz	"irq       "
    .asciz	"fiq       "

    @ Register-dump labels: 6 bytes each (5 characters + NUL), matching the
    @ order the registers were pushed (PSRs first, then r0-r12 and lr).
register_names:
    .asciz	"apsr "
    .asciz	"spsr "
    .asciz	"r0   "
    .asciz	"r1   "
    .asciz	"r2   "
    .asciz	"r3   "
    .asciz	"r4   "
    .asciz	"r5   "
    .asciz	"r6   "
    .asciz	"r7   "
    .asciz	"r8   "
    .asciz	"r9   "
    .asciz	"r10  "
    .asciz	"r11  "
    .asciz	"r12  "
    .asciz	"r14  "

    .p2align      3


    @ Enable the caches.  Invalidates the TLB and branch predictor, enables
    @ branch prediction, then enables the I- and D-caches if the
    @ init_cpu_client_enable_* hooks return non-zero.  Returns via r5
    @ (lr is clobbered by the bl calls).  Clobbers r0, r4, r5, flags.
__enable_caches:
    mov         r0, #0
    mcr         15, 0, r0, cr8, cr7, 0  @ Invalidate all unified-TLB
    mov         r0, #0
    mcr         15, 0, r0, cr7, cr5, 6  @ Invalidate branch predictor
    mrc         15, 0, r4, cr1, cr0, 0  @ Read SCTLR
    orr         r4, r4, #0x800          @ Enable branch predictor
    mcr         15, 0, r4, cr1, cr0, 0  @ Set SCTLR
    mov         r5, lr                  @ Save LR as we're going to BL
    mrc         15, 0, r4, cr1, cr0, 0  @ Read SCTLR
    bl          init_cpu_client_enable_icache
    cmp         r0, #0
    it		ne
    orrne       r4, r4, #0x1000         @ Enable I-Cache
    bl          init_cpu_client_enable_dcache
    cmp         r0, #0
    it		ne
    orrne       r4, r4, #4              @ Enable D-Cache
    mcr         15, 0, r4, cr1, cr0, 0  @ Write SCTLR with the cache enables
    bx          r5                      @ Return

__reset_caches:
    @ Disable the I-cache, then clean and/or invalidate every level of data
    @ or unified cache by set/way (the CLIDR/CCSIDR walk below follows the
    @ example sequence in the ARMv7-A Architecture Reference Manual),
    @ finally leaving the D-cache disabled and the branch predictor
    @ invalidated.  The return address is kept in ip because both lr (used
    @ as the cache-level counter) and the bl calls clobber lr.
    @ r6 preserves the original SCTLR so the set/way loop knows whether the
    @ D-cache was on (clean+invalidate) or off (invalidate only).
    mov         ip, lr                  @ Save LR
    mov         r0, #0
    mcr         15, 0, r0, cr7, cr5, 6  @ Invalidate branch predictor
    mrc         15, 0, r6, cr1, cr0, 0  @ Read SCTLR
    mrc         15, 0, r0, cr1, cr0, 0  @ Read SCTLR!
    bic         r0, r0, #0x1000         @ Disable I cache
    mcr         15, 0, r0, cr1, cr0, 0  @ Write SCTLR
    mrc         15, 1, r0, cr0, cr0, 1  @ Read CLIDR
    tst         r0, #3                  @ Harvard Cache?
    mov         r0, #0
    it		ne
    mcrne       15, 0, r0, cr7, cr5, 0  @ Invalidate Instruction Cache?

    mrc         15, 0, r1, cr1, cr0, 0  @ Read SCTLR (again!)
    orr         r1, r1, #0x800          @ Enable branch predictor

                                        @ If we're not enabling caches we have
                                        @ no more work to do.
    bl          init_cpu_client_enable_icache
    cmp         r0, #0
    it		ne
    orrne       r1, r1, #0x1000         @ Enable I-Cache now -
                                        @ We actually only do this if we have a
                                        @ Harvard style cache.
    it		eq
    bleq        init_cpu_client_enable_dcache
    itt		eq
    cmpeq       r0, #0
    beq         Finished1               @ Neither cache wanted - skip the walk

    mcr         15, 0, r1, cr1, cr0, 0  @ Write SCTLR (turn on Branch predictor & I-cache)

    mrc         15, 1, r0, cr0, cr0, 1  @ Read CLIDR
    ands        r3, r0, #0x7000000      @ Extract Level of Coherency
    lsr         r3, r3, #23             @ Total cache levels << 1
    beq         Finished1

    mov         lr, #0                  @ lr = cache level << 1
Loop11:
    mrc         15, 1, r0, cr0, cr0, 1  @ Read CLIDR
    add         r2, lr, lr, lsr #1      @ r2 holds cache 'set' position
    lsr         r1, r0, r2              @ Bottom 3-bits are Ctype for this level
    and         r1, r1, #7              @ Get those 3-bits alone
    cmp         r1, #2
    blt         Skip1                   @ No cache or only I-Cache at this level
    mcr         15, 2, lr, cr0, cr0, 0  @ Write CSSELR
    mov         r1, #0
    isb         sy                      @ CSSELR write must settle before CCSIDR read
    mrc         15, 1, r1, cr0, cr0, 0  @ Read CCSIDR
    and         r2, r1, #7              @ Extract line length field
    add         r2, r2, #4              @ Add 4 for the line length offset (log2 16 bytes)
    movw        r0, #0x3ff
    ands        r0, r0, r1, lsr #3      @ r0 is the max number on the way size
    clz         r4, r0                  @ r4 is the bit position of the way size increment
    movw        r5, #0x7fff
    ands        r5, r5, r1, lsr #13     @ r5 is the max number of the index size (right aligned)
Loop21:
    mov r7, r0                          @ r7 working copy of max way size
Loop31:
    orr         r1, lr, r7, lsl r4      @ factor in way number and cache number
    orr         r1, r1, r5, lsl r2      @ factor in set number
    tst         r6, #4                  @ D-Cache on?
    ite         eq
    mcreq       15, 0, r1, cr7, cr6, 2  @ No - invalidate by set/way
    mcrne       15, 0, r1, cr7, cr14, 2 @ yes - clean + invalidate by set/way
    subs        r7, r7, #1              @ Decrement way number
    bge         Loop31
    subs        r5, r5, #1              @ Decrement set number
    bge         Loop21
Skip1:
    add         lr, lr, #2              @ increment cache number
    cmp         r3, lr
    bgt         Loop11
Finished1:
    @ Now we know the caches are clean we can:
    mrc         15, 0, r4, cr1, cr0, 0  @ Read SCTLR
    bic         r4, r4, #4              @ Disable D-Cache
    mcr         15, 0, r4, cr1, cr0, 0  @ Write SCTLR
    mov         r4, #0
    mcr         15, 0, r4, cr7, cr5, 6  @ Write BPIALL

    bx          ip                      @ Return

    @ Set Z if this is a Cortex-A15 or Cortex_A7
    @ Other flags corrupted
    @ Clobbers r8, r9.  Masks the MIDR down to implementer/architecture/
    @ part-number and compares against the Cortex-A15 and Cortex-A7 ids.
is_a15_a7:
    mrc         15, 0, r8, c0, c0, 0    @ Read MIDR
    movw        r9, #0xfff0
    movt        r9, #0xff0f             @ r9 = 0xff0ffff0 mask: keep
                                        @ implementer/arch/part, drop revision
    and         r8, r8, r9
    movw        r9, #0xc0f0
    movt        r9, #0x410f             @ r9 = 0x410fc0f0: ARM Cortex-A15
    cmp         r8, r9
    movw        r9, #0xc070
    movt        r9, #0x410f             @ r9 = 0x410fc070: ARM Cortex-A7
    it		ne
    cmpne       r8, r9                  @ Only test for A7 if not an A15
    bx          lr

    @ Each level-1 descriptor below is a section base address OR'd with
    @ 0x1c0e, which encodes these attributes:
    @ Descriptor type: Section
    @ Bufferable: True
    @ Cacheable: True
    @ Execute Never: False
    @ Domain: 0
    @ Impl. Defined: 0
    @ Access: 0/11 Full access
    @ TEX: 001
    @ Shareable: False
    @ Not Global: False
    @ Supersection: False
    @ The PTn macros below expand geometrically: each level emits four
    @ copies of the previous one at successive section strides, so
    @ PT7(base) emits all 4096 one-word entries of a flat 1:1 map of the
    @ 4GB address space in 1MB sections.
#define PT(X) \
    .word	X;
#define PT2(X) \
    PT(X)  PT(X + 0x100000)    PT(X + 0x200000)    PT(X + 0x300000)
#define PT3(X) \
    PT2(X) PT2(X + 0x400000)   PT2(X + 0x800000)   PT2(X + 0xc00000)
#define PT4(X) \
    PT3(X) PT3(X + 0x1000000)  PT3(X + 0x2000000)  PT3(X + 0x3000000)
#define PT5(X) \
    PT4(X) PT4(X + 0x4000000)  PT4(X + 0x8000000)  PT4(X + 0xc000000)
#define PT6(X) \
    PT5(X) PT5(X + 0x10000000) PT5(X + 0x20000000) PT5(X + 0x30000000)
#define PT7(X) \
    PT6(X) PT6(X + 0x40000000) PT6(X + 0x80000000) PT6(X + 0xc0000000)

    .section    page_tables_section, "aw", %progbits
    .p2align    14                      @ L1 table must be 16KB-aligned
page_tables:
     PT7(0x1c0e)

#endif //#if defined(__ARM_ARCH_7A__) || __ARM_ARCH_PROFILE == 'A'