diff --git a/mmu/Makefile b/mmu/Makefile index 537efe6..ce2dc2a 100644 --- a/mmu/Makefile +++ b/mmu/Makefile @@ -30,9 +30,3 @@ notmain.hex : memmap novectors.o periph.o notmain.o $(ARMGNU)-objdump -D notmain.elf > notmain.list $(ARMGNU)-objcopy notmain.elf -O ihex notmain.hex - - - - - - diff --git a/mmu/README b/mmu/README deleted file mode 100644 index 5924ff0..0000000 --- a/mmu/README +++ /dev/null @@ -1,863 +0,0 @@ - -See the top level README file for more information on documentation -and how to run these programs. - -This example demonstrates ARM MMU basics. - -You will need the ARM ARM (ARM Architectural Reference Manual) for -ARMv5. I have a couple of pages included in this repo, but you still -will need the ARM ARM. - -This code so far does not work on the Raspberry pi 2 yet, will get -that working at some point, the knowledge here still applies, I expect -the differences to be subtle between ARMv6 and 7 but will see. - - - --- NEED TO RE-WRITE THIS AGAIN, SUBPAGES ENABLED, COARSE 1KB TABLES -- - - - - -So what an MMU does or at least what an MMU does for us is it -translates virtual addresses into physical addresses as well as -checking access permissions, and gives us control over cachable -regions. - -So what does all of that mean? - -There is a boundary inside the chip around the ARM core, part of that -boundary is the memory interface for the ARM for lack of a better term -how the ARM accesses the world. Nothing special, all processors have -some sort of address and data based interface between the processor and -the ram and peripherals. That boundary uses physical addresses, that -boundary is on the memory side or "world side" of the ARM's mmu. -Within the ARM core there is the "processor side" of the mmu, and all -load and store (and fetch) accesses to the world go through the mmu. 
- -When the ARM powers up the mmu is disabled, which means all accesses -pass through unmodified making the "processor side" or virtual address -space equal to the world side physical address space. All of my -examples thus far, blinkers and such are based on physical addresses. -We already know that elsewhere in the chip is another address -translation of some sort, because the manual is written for 0x7Exxxxxx -based addresses, but the ARM's physical addresses for those same things -is 0x20xxxxxx for the raspi 1 and 0x3Fxxxxxx for the raspi 2. For this -discussion we dont care about that other mystery address translation, -we only care about the ARM and the ARM mmu. - -So when I say the mmu translates virtual addresses into physical -addresses. What that means is on the processor side there is an address -you are accessing, but that does not have to be the same address on -the physical address side of the mmu. Lets say for example I am -running a program on an operating system, Linux lets say, and I need -to compile that program before I can use it and I need to link it for -an address space so lets say that I link it to enter at address 0x8000 -and use memory from 0x0000 to whatever I need and/or whatever is -available. So that is all fine, except what if I have two programs -and I want both running "at the same time" how can both use the same -address space without clobbering each other? The answer is neither is -at that address space the virtual address WHEN RUNNING one of them is -in the virtual address space 0x00000000 to some number, but in reality -program 1 might have that mapped to the physical address 0x01000000 and -program 2 might have its 0x00000000 to some number mapped to 0x02000000. -So when program 1 thinks it is writing to address 0xABCDE it is really -writing to 0x010ABCDE and when program 2 thinks it is writing to -address 0xABCDE it is really writing to 0x020ABCDE.
- -If you think about it it doesnt make any sense to allow any virtual -address to map to any physical address, for example from 0x12345678 -to 0xAABBCCDD. Think about it, we are talking about a 32 bit address -space or 4Giga addresses. If we allowed any address to convert to -any other address we would need a 4Giga to 4Giga map, we would actually -need 16Gigabytes just to hold the 4Giga physical adresses worst case. -To cut to the chase ARM has one option where the top 12 bits of the -virtual get translated to 12 bits of physical, the lower 20 bits in -that case are the same between the virtual and physical. This means -we can control 1MByte of address space with one definition, and have -4096 entries in some table somewhere to convert from virtual to -physical. That is quite managable. The minimum we would need to -store are the 12 replacement bits per table entry, but ARM uses a full -32 bit entry, which for this 1MB flavor, has the 12 physical bits plus -some other control bits. - -What does cachable regions mean? The mmu also gives you the feature -of being able to choose per descriptor whether or not you want to -enable caching on that block. One obvious reason would be for the -peripherals. Think about a timer, ideally you read the current timer -tick and each time you read it you get the current timer tick and -as it changes you see it change. But what if when we turned on the -data cache it covered all addresses, all loads and stores? Then you -read the timer once, get a value, read it again, now you get the -cached value over and over again you dont see the real timer value -in the peripheral. That is not good, you cannot manage a peripheral -if you cannot read its status register or read the data coming out -of it, etc. So at a minimum your peripherals need to be in non-cached -blocks. 
Likewise, if you have some ram that is shared by more than -one resource, say the GPU and the ARM or for the raspberry pi 2 shared -between multiple ARM cores, you have a similar situation, another -resource may change the ram on the far side of your cache but your -cache assumes it has a copy of what is in ram. Basically a cache -only helps you if whatever on the far side of it is only modified by -writes through the cache, if there are ways to change the data on -the far side you should not cache that area. The mmu gives you -the ability to control cached and non-cahced spaces. - -What is meant by access permissions? Lets think about those two -programs running "at the same time" on some operating system (Linux -for example) you dont want to allow one program to gain access to -the operating systems data nor some other programs data. Some -operating systems sure that are meant for only running trusted and -well mannered programs. But you dont want some video game on your -home computer to have access to your banking account data in another -window/program? The mechanisms vary across processor families but -an important job for the mmu is to provide a protection mechanism. -Such that when a particular program has a time slice on the processor -there is some mechanism to allow or restrict memory spaces. If some -code accesses an address that it does not have permission for then -an abort happens and the processor is notified. An interesting -side effect of this is that this doesnt have to be fatal, in fact it -could be by design. Think of a virtual machine, you could let the -virtual machine software run on the processor, and when it accesses -one of its peripherals the real operating system gets an abort but -instead of killing the virtual machine it actually simulates the -peripheral and lets the virtual machine keep running. 
Another one -that you have probably run into is when you run out of ram in your -computer, the notion of virtual memory which is differen than virtual -address space. Virtual memory in this case is when your program -ventures off the end of its allowed address space into ram it thinks -it has. The operating system gets an abort, finds some ram from -some other program, swaps that ram to disk for example, then allows -the program that was running to have a little more ram by mapping it -back in and allowing it to run. Later when the program whose data -got swapped to disk needs it it swaps back and whatever was in the -ram it swaps with then goes to disk. The term swap comes from the -idea that these blocks of ram are swapped back and forth to disk, -program A's ram goes to disk and is swapped with program T's, then -program T's is swapped with program K's and so on. This is why -starting right after you venture off that edge from real ram to -virtual, your computers performance drops dramatically and disk -activity goes way up, the more things running the more swapping going -on and disk is significantly slower than ram. - -As with all baremetal programming, wading through documentation is -the bulk of the job. Definitely true here, with the unfortunate -problem that ARM's docs dont all look the same from one Archtectural -Reference Manual to an other. We have this other problem that we -are techically using an ARMv6 (architecture version 6)(for the raspi 1) -but when you go to ARM's website there is an ARMv5 and then ARMv7 and -ARMv8, but no ARMv6. Well the ARMv5 manual is actually the original -ARM ARM, that I assume they realized couldnt maintain all the -architecture variations forever in one document, so they perhaps -wisely went to one ARM ARM per rev. 
With respect to the MMU, the ARMv5 -reference manual covers the ARMv4 (I didnt know there was an mmu option -there) ARMv5 and ARMv6, and there is mode such that you can have the -same code/tables and it works on all three, meaning you dont have to -if-then-else your code based on whatever architecture you find. This -raspi 1 example is based on subpages enabled which is this legacy or -compatibility mode across the three. - -I am mostly using the ARMv5 Architectural Reference Manual. -ARM DDI0100I. - -The 1MB sections mentioned above are called...sections...The ARM -mmu also has blobs that are smaller sizes 4096 byte pages for -example, will touch on those two sizes. The 4096 byte one is called -a small page. - -As mentioned above, 32 bit address space, 1MB is 20 bits so 32-20 is -12 bits or 4096 possible combinations or the address space is broken -up into 4096 1MB sections. The top 12 bits of the virtual address -get translated to 12 bits of physical. No rules on the translation -you can have virtual = physical or have any combination, or have -a bunch of virtual sections point at the same physical space, whatever -you want/need. - -ARM uses the term Virtual Memory System Architecture or VMSA and -they say things like VMSAv6 to talk about the ARMv6 VMSA. There -is a section in the ARM ARM titled Virtual Memory System Architecture. -In there we see the coprocessor registers, specifically CP15 register -2 is the translation table base register. - - -So the ARMv5 ARM ARM (ARM Architectural Reference Manual) is what -we need now. See the top level README for finding this document, -I have included a few pages in the form of postscript, any decent pdf -viewer should be able to handle these files. Before the pictures -though, the section in quesiton is titled Virtual Memory System -Architecture. In the CP15 subsection register 2 is the the translation -table base register. 
There are three opcodes which give us access to -three things, TTBR0, TTBR1 and the control register. - -First we read this comment - -If N = 0 always use TTBR0. When N = 0 (the reset case), the translation -table base is backwards compatible with earlier versions of the -architecture. - -That is the one we want, we will leave that as N = 0 and not touch it -and use TTBR0 - -Now what the TTBR0 description initially is telling me that bit 31 -down to 14-n or 14 in our case since n = 0 is the base address, in -PHYSICAL address space. Note the mmu cannot possibly go through the -mmu to figure out how to go through the mmu, the mmu itself only -operates on physical space and has direct access to it. In a second -we are going to see that we need the base address for the mmu table -to be aligned to 16384 bytes. (2 to the power 14, the lower 14 bits -of our TLB base address needs to be all zeros). - -We write that register using - - mcr p15,0,r0,c2,c0,0 ;@ tlb base - -TLB = Translation Lookaside Buffer. As far as we are concerned think -of it as an array of 32 bit integers, each integer (descriptor) being -used to completely or partially convert from virtual to physical and -describe permissions and caching. - -My example is going to have a define called MMUTABLEBASE which will -be where we start our TLB table. - -Here is the reality of the world. Some folks struggle with bit -manipulation, orring and anding and shifting and such, some dont. The -MMU is logic so it operates on these tables in the way that logic would, -meaning from a programmers perspective it is a lot of bit manipulation -but otherwise is relatively simple to something a program could do. As -programmers we need to know how the logic uses portsion of the virtual -address to look into this descriptor table or TLB, and then extracts -from those bits the next thing it needs to do. 
We have to know this so -that for a particular virtual address we can place the descriptor we -want in the place where the hardware is going to find it. So we need -a few lines of code plus some basic understanding of what is going on. -Just like bit manipulation causes some folks to struggle, reading -a chapter like this mmu chapter is equally daunting. It is nice to -have somehone hold your hand through it. Hopefully I am doing more -good than bad in that respect. - -There is a file, section_translation.ps in this repo, you should be -able to use a pdf viewer to open this file. The figure on the -second page shows just the address translation from virtual to physical -for a 1MB section. This picture uses X instead of N, we are using an -N = 0 so that means X = 0. The translation table base at the top -of the diagram is our MMUTABLEBASE, the address in physical space -of the beginning of our first level TLB or descriptor table. The -first thing we need to do is find the table entry for the virtual -address in question (the Modified virtual address in this diagram, -as far as we are concerned it is unmodified it is the virtual -address we intend to use). The first thing we see is the lower -14 bits of the translation table base are SBZ = should be zero. -Basically we need to have the translation table base aligned on a -16Kbyte boundary (2 to the 14th is 16K). It would not make sense -to use all zeros as the translation table base, we have our reset -and interrupt vectors at and near address zero in the arms address -space so the first sane address would be 0x00004000. The first -level descriptor is based on the top 12 bits of the virtual address -or 4096 entries, that is 16KBytes (not a coincidence), 0x4000 + 0x4000 -is 0x8000, where our arm programs entry point is, so we have space -there if we want to use it. 
But any address with the lower 14 bits -being zero will work so long as you have enough memory at that address -and you are not clobbering anything else that is using that memory -space. - -So what this picture is showing us is that we take the top 12 bits -of the virtual address, multiply by 4 or shift left 2, and add tat -to the translation table base, this gives the address for the first -level descriptor for that virtual address. The diagram shows the -first level fetch which returns a 32 bit value that we have placed -in the table. If the lower 2 bits of that first level descriptor are -0b10 then this is a 1MB Section. If a 1MB section then the top 12 -bits of the first level descriptor replace the top 12 bits of the -virtual address to convert it into a physical address. Understand -here first and foremost so long as we do the N = 0 thing, the first -level descriptor or the first thing the mmu does is look at the top -12 bits of the virtual address, always. If the lower two bits of -the first level descriptor are not 0b10 then we get into -a second level descriptor and more virtual bits come into play, but -for now if we start by learning just 1MB sections, the conversion -from virtual to physical only cares about the top 12 bits of the -address. So for 1MB sections we dont have to concentrate on every -actual address we are going to access we only need to think about -the 1MB aligned ranges. The uart for example on the raspi 1 has -a number of registers that start with 0x202150xx, if we use a 1MB -section for those we only care about the 0x202xxxxx part of the -address. To not have to change our code we would want to have -the virtual = physical for that and do not mark it as cacheable. - -So if my MMUTABLEBASE was 0x00004000 and I had a virtual address of -0x12345678 then the hardware is going to take the top 12 bits of that -address 0x123, multiply by 4 and add that to the MMUTABLEBASE. -0x4000+(0x123<<2) = 0x448C. 
and that is the address the mmu is going -to use for the first-level lookup. Ignoring the other bits in the -descriptor for now, if the first-level descriptor has the value -0xABC00002, the lower two bits are 0b10, a 1MB section, so the top -12 bits replace the virtual address's top 12 bits and our 0x12345678 -is converted to the physical address 0xABC45678. - - -Now they have this optional thing called a supersection which is a 16MB -sized thing rather than 1MB and one might think that that would make -life easier, right? Wrong. No matter what, assuming the N = 0 thing -the first level descriptor is found using the top 12 bits of the -virtual address, so in order to do some 16MB thing you need 16 entries -one for each of the possible 1MB sections. If you are already -generating 16 descriptors might as well just make them 1MB sections, -you can read up on the differences between super sections and sections -and try them if you want. For what I am doing here dont need them, -just wanted to point out you still need 16 entries per super section. - -Hopefully I have not lost you yet with this address manipulation, -and maybe you are one step ahead of me, yes EVERY load and store with -the mmu enabled requires at least one mmu table lookup, the mmu when it -accesses this memory does not go through itself, but EVERY other fetch -and load and store. Which does have a performance hit, they do have -a bit of a cache in the mmu to store the last so many tlb lookups. -That helps, but you cannot avoid the mmu having to do the conversion -on every address. - -In the ARM ARM I am looking at the subsection on first-level descriptors -has a table: -Table B4-1 First-level descriptor format (VMSAv6, subpages enabled) -What this is telling us is that if the first-level descriptor, the -32 bit number we place in the right place in the TLB, has the lower -two bits 0b10 then that entry defines a 1MB section and the mmu can get -everything it needs from that first level descriptor.
But if the -lower two bits are 0b01 then this is a coarse page table entry and -we have to go to a second level descriptor to complete the -conversion from virtual to physical. Not every address will need -this only the address ranges we want to be more finely divided than -1MB. Or the other way of saying it is if we want to control an -address range in chunks smaller than 1MB then we need to use pages -not sections. You can certainly use pages for the whole world, but -if you do the math, 4096Byte pages would mean your mmu table needs -to be 4MB+16K worst case. And you have to do more work to set that -all up. - -The coarse_translation.ps file I have included in this repo starts -off the same way as a section, has to, the logic doesnt know what -you want until it sees the first level descriptor. If it sees a -0b01 as the lower 2 bits of the first level descriptor then this is -a coarse page table entry and it needs to do a second level fetch. -The second level fetch does not use the mmu tlb table base address, -bits 31:10 of the first level descriptor plus bits 19:12 of the -virtual address (times 4) are where the second level descriptor lives. -Note that is 8 more bits so the section is divided into 256 parts, this -page table address is similar to the mmu table address, but it needs -to be aligned on a 1K boundary (lower 10 bits zeros) and can be worst -case 1KBytes in size. - -The second level descriptor format defined in the ARM ARM (small pages -are most interesting here, subpages enabled) is a little different -than a first level section, we had a domain in the first level -descriptor to get here, but now have direct access to four sets of -AP bits you/I would have to read more to know what the difference -is between the domain defined AP and these additional four, for now -I dont care this is bare metal, set them to full access (0b11) and -move on (see below about domain and ap bits).
- -So lets take the virtual address 0x12345678 and the MMUTABLEBASE of -0x4000 again. The first level descriptor address is the top twelve -bits of the virtual address 0x123, times 4, added to the MMUTABLEBASE -0x448C. But this time when we look it up we find a value in the -table that has the lower two bits being 0b01. Just to be crazy lets -say that descriptor was 0xABCDE001 (ignoring the domain and other -bits just talking address right now). That means we take 0xABCDE000 -the picture shows bits 19:12 (0x45) of the virtual address (0x12345678) -so the address to the second level descriptor in this crazy case is -0xABCDE000+(0x45<<2) = 0xABCDE114 why is that crazy? because I -chose an address where we in theory dont have ram on the raspberry pi -maybe a mirrored address space, but a sane address would have been -somewhere close to the MMUTABLEBASE so we can keep the whole of the -mmu tables in a confined area. Used this address simply for -demonstration purposes not based on a workable solution. - -The "other" bits in the descriptors are the TEX bits, -the C and B bits, domain and AP. - -The C bit is the simplest one to start with that means Cacheable. For -peripherals we absolutely dont want them to be cached. For ram, maybe. - -The b bit, means bufferable, as in write buffer. Something you may -not have heard about or thought about ever. It is kind of like a cache -on the write end of things instead of read end. I digress, when -a processor writes something everything is known, the address and -data. So the next level of logic, could, if so designed, accept -that address and data at that level and release the processor to -keep doing what it was doing (ideally fetch some more instructions -and keep running) in parallel that logic could then continue to perform -the write to the slower peripheral or really slow dram (or faster cache). -Giving us a small to large performance gain.
But, what happens if while -we are doing that first write another write happens? Well if we only -have storage for one transaction in this little feature then the -processor has to wait for us to finish the first write however long -that takes, then we can grab the information for the second write and -then release the processor. I call writes "fire and forget" because -ideally the processor hands off the info to the memory controller -and keeps going, the memory controller has all the info it needs to -complete the task. For a read the processor needs that data back so -basically has to wait. Well a write buffer can store up to some number -of addresses and data. It can still fill up and have to hold the -processor off. But it is similar to what a cache is to reading, it has -some faster ram that stages writes so the processor, sometimes, can -keep on going. - -Now the TEX bits you just have to look up and there is the rub there -are likely more than one set of tables for TEX C and B, I am going -to stick with a TEX of 0b000 and not mess with any fancy features -there. Now depending on whether this is considered an older arm -(ARMv5) or an ARMv6 or newer the combination of TEX, C and B have -some subtle differences. The cache bit in particular does enable -or disable this space as cacheable. That simply asserts bits on -the AMBA/AXI (memory) bus that marks the transaction as cacheable, -you still need a cache and need it setup and enabled for the -transaction to actually get cached. If you dont have the cache for -that transaction type enabled then it just does a normal memory (or -peripheral) operation. So we set TEX to zeros to keep it out of the -way. - -Lastly the domain and AP bits. Now you will see a 4 bit domain thing -and a 2 bit domain thing. These are related. There is a register in -the MMU right next to the translation table base address register this -one is a 32 bit register that contains 16 different domain definitions.
- -The two bit domain controls are defined as such (these are AP bits) - -0b00 No access Any access generates a domain fault -0b01 Client Accesses are checked against the access permission bits in the TLB entry -0b10 Reserved Using this value has UNPREDICTABLE results -0b11 Manager Accesses are not checked against the access permission bits in the TLB -entry, so a permission fault cannot be generated - -For starters we are going to set all of the domains to 0b11 dont check -cant fault. What are these 16 domains though? Notice it takes 4 bits -to describe one of 16 things. The different domains have no specific -meaning other than that we can have 16 different definitions that we -control for whatever reason. You might allow for 16 different -threads running at once in your operating system, or 16 different -types of software running (kernel, application, ...) you can mark -a bunch of sections as belonging to one parituclar domain, and with a -simple change to that domain control register, a whole domain might -go from one type of permission to another, from no checking to -no access for example. By just writing this domain register you can -quickly change what address spaces have permission and which ones dont -without necessarily changing the mmu table. - -Since I usually use the MMU in bare metal to enable data caching on ram -I set my domain controls to 0b11, no checking and I simply make all -the MMU sections domain number 0. - -So we end up with this simple function that allows us to add first level -descriptors in the MMU translation table. - -unsigned int mmu_section ( unsigned int vadd, unsigned int padd, unsigned int flags ) -{ - unsigned int ra; - unsigned int rb; - unsigned int rc; - - ra=vadd>>20; - rb=MMUTABLEBASE|(ra<<2); - ra=padd>>20; - rc=(ra<<20)|flags|2; - PUT32(rb,rc); - return(0); -} - -So what you have to do to turn on the MMU is to first figure out all -the memory you are going to access, and make sure you have entries -for that. 
This is important, if you forget something, and dont have -a valid entry there, then you fault, your fault handler, if you have -chosen to write it, may also fault if it isnt placed right or something -it accesses also faults...(I would assume the fault handler is also -behind the mmu but would have to read up on that). - -So the smallest amount of ram on a raspi is 256MB or 0x10000000 bytes. - -Our program enters at address 0x8000, so that is within the first -section 0x000xxxxx so we should make that section cacheable and -bufferable. - - mmu_section(0x00000000,0x00000000,0x0000|8|4); - -This is saying map the virtual 0x000xxxxx to the physical 0x000xxxxx -enable the cache and write buffer. 0x8 is the C bit and 0x4 is the B -bit. tex, domain, etc are zeros. - -If we want to use all 256mb we would need to do this for all the -sections from 0x000xxxxx to 0x0FFxxxxx. Maybe do that later. - -We know that for the raspi1 the peripherals, uart and such are in -arm physical space at 0x20xxxxxx. To allow for more ram on the raspi 2 -they needed to move that and moved it to 0x3Fxxxxxx. So we either need -16 1MB section sized entries to cover that whole range or we look at -specific sections for specific things we care to talk to and just add -those. The uart and the gpio it is associated with is in the 0x202xxxxx -space. There are a couple of timers in the 0x200xxxxx space so one -entry can cover those. - -if we didnt want to allow those to be cached or write buffered then - - mmu_section(0x20000000,0x20000000,0x0000); //NOT CACHED! - mmu_section(0x20200000,0x20200000,0x0000); //NOT CACHED! - mmu_section(0x3F000000,0x3F000000,0x0000); //NOT CACHED! - mmu_section(0x3F200000,0x3F200000,0x0000); //NOT CACHED! - -but we may play with that to demonstrate what caching a peripheral -can do to you, why we need to turn on the mmu if for no other reason -than to get some bare metal performance by using the d cache.
- -Now you have to think on a system level here, there are a number -of things in play. We need to plan our memory space, where are we -putting the MMU table, where are our peripherals, where is our program. - -If the only reason for using the mmu is to allow the use of the d cache -then just map the whole world virtual = physical if you want with the -peripherals not cached and the rest cached. - -If you are on the raspi 2 with multiple arm cores and are using -the multiple arm cores you need to do more reading if you want one -core to talk to another by sharing some of the memory between -them. Same problem as peripherals basically with multiple masters -of the ram/peripheral on the far side of my cache, how do I insure -what is in my cache maches the far side? Easiest way is to not -cache that space. You need to read up on if the cores share a cache -or have their own (or if l2 if present is shared but l1 is not), -ldrex/strex were implemented specifically for multi core, but you -need to understand the cache effects on these instructions ( -not documented well, I have an example on just this one topic). - -So once our tables are setup then we need to actually turn the -MMU on. Now I cant figure out where I got this from, and I have -modified it in this repo. According to this manual it was with the -ARMv6 that we got the DSB feature which says wait for either cache -or MMU to finish something before continuing. In particular when -initializing a cache to start it up you want to clean out all the -entries in a safe way you dont want to evict them and hose memory -you want to invalidate everything, mark it such that the cache lines -are empty/available. Likewise that little bit of TLB caching the MMU -has, we want to invalidate that too so we dont start up the mmu -with entries in there that dont match our entries. - -Why are we invalidating the cache in mmu init code? 
Because first we -need the mmu to use the d cache (to protect the peripherals from -being cached) and second the controls that enable the mmu are in the -same register as the i and d controls so it made sense to do both -mmu and cache stuff in one function. - -So after the DSB we set our domain control bits, now in this example -I have done something different, 15 of the 16 domains have the 0b11 -setting which is dont fault on anything, manager mode. I set domain -1 such that it has no access, so in the example I will change one -of the descriptor table entries to use domain one, then I will access -it and then see the access violation. I am also programming both -translation table base addresses even though we are using the N = 0 -mode and only one is needed. Depends on which manual you read I guess -as to whether or not you see the N = 0 and the separate or shared -i and d mmu tables. (the reason for two is if you want your i and -d address spaces to be managed separately). - -Understand I have been running on ARMv6 systems without the DSB and it -just works, so maybe that is dumb luck... - -This code relies on the caller to pass in the MMU enable and I and D -cache enables. This is because this is derived from code where -sometimes I turn things on or dont turn things on and wanted it -generic. - - -.globl start_MMU -start_MMU: - mov r2,#0 - mcr p15,0,r2,c7,c7,0 ;@ invalidate caches - mcr p15,0,r2,c8,c7,0 ;@ invalidate tlb - mcr p15,0,r2,c7,c10,4 ;@ DSB ?? - - mvn r2,#0 - bic r2,#0xC - mcr p15,0,r2,c3,c0,0 ;@ domain - - mcr p15,0,r0,c2,c0,0 ;@ tlb base - mcr p15,0,r0,c2,c0,1 ;@ tlb base - - mrc p15,0,r2,c1,c0,0 - orr r2,r2,r1 - mcr p15,0,r2,c1,c0,0 - - bx lr - -I am going to mess with the translation tables after the MMU is started -so the easiest way to deal with the TLB cache is to invalidate it, but -dont need to mess with main L1 cache. ARMv6 introduces a feature to -help with this, but going with this solution. 
- -.globl invalidate_tlbs -invalidate_tlbs: - mov r2,#0 - mcr p15,0,r2,c8,c7,0 ;@ invalidate tlb - mcr p15,0,r2,c7,c10,4 ;@ DSB ?? - bx lr - -Something to note here. Debugging using the JTAG based on chip debugger -makes life easier, that removing sd cards or the old days pulling an -eeprom out and putting it it in an eraser then a programmer. BUT, -it is not completely without issue. When and where and if you hit this -depends heavily on the core you are using and the jtag tools and the -commands you remember/prefer. The basic problem is caches can and -often do separate instruction I fetches from data D reads and writes. -So if you have test run A of a program that has executed the instruction -at address 0xD000. So that instruction is in the I cache. You have -also executed the instruction at 0xC000 but it has been evicted, but -you dont actually know what is in the I cache or not, shouldnt even -try to assume. You stop the processor, you write a new program to -memory, now these are data D writes, and go through the D cache. Then -you set the start address and run again. Now there are a number of -combinations here and only one if them works, the rest can lead to -failure. - -For each instruction/address in the program, if the prior instruction -at that address was in the i cache, and since data writes do not go -through the i cache then the new instruction for that address is either -in the d cache or in main ram. When you run the new program you will -get the stale/old instruction from a prior run when you fetch that -address (unless an invalidate happens, if a flush happens then you -write back, but why would an I cache flush?), and if the new instruction -at that address is not the same as the old one unpredictable results -will occur. You can start to see the combinations, did the data -write go through to d cache or to ram, will it flush to ram and is the -i cache invalid for that address, etc. 
- -There is also the quesiton of are the I and D caches shared, they can -be but that is both specific to the core and your setup. Also does -the jtag debugger have the ability to disable the caches, has it done -it for you, can you do it manually. - -Any time you are using the i or d caches you need to be careful using -a jtag debugger or even a bootloader type approach depending on its -design as you might end up doing data writes of instructions and going -around the i cache or worse. So for this kind of work using a chip -reset and non volitle rom/flash based bootloader can/will save you -a lot of headaches. If you know your debugger is solving this for you, -great, but always make sure as you change from the raspi 2 back to -a raspi 1 for example it might not be doing it and it will drive you -nuts when you keep downloading a new program and it either crashes -in a strange way or simply just keeps running the old program and -not appearing to take your new changes. - -So the example is going to start with the mmu off and write to -addresses in four different 1MB address spaces. So that later we -can play with the section descriptors and demonstrate virtual to -physical address conversion. - -So write some stuff and print it out on the uart. - - PUT32(0x00045678,0x00045678); - PUT32(0x00145678,0x00145678); - PUT32(0x00245678,0x00245678); - PUT32(0x00345678,0x00345678); - - hexstring(GET32(0x00045678)); - hexstring(GET32(0x00145678)); - hexstring(GET32(0x00245678)); - hexstring(GET32(0x00345678)); - uart_send(0x0D); uart_send(0x0A); - -then setup the mmu with at least those four sections and the peripherals - - mmu_section(0x00000000,0x00000000,0x0000|8|4); - mmu_section(0x00100000,0x00100000,0x0000); - mmu_section(0x00200000,0x00200000,0x0000); - mmu_section(0x00300000,0x00300000,0x0000); - //peripherals - mmu_section(0x20000000,0x20000000,0x0000); //NOT CACHED! - mmu_section(0x20200000,0x20200000,0x0000); //NOT CACHED! 
- -and start the mmu with the I and D caches enabled - - start_mmu(MMUTABLEBASE,0x00000001|0x1000|0x0004); - -then if we read those four addresses again we get the same output -as before since we maped virtual = physical. - - hexstring(GET32(0x00045678)); - hexstring(GET32(0x00145678)); - hexstring(GET32(0x00245678)); - hexstring(GET32(0x00345678)); - uart_send(0x0D); uart_send(0x0A); - -but what if we swizzle things around. make virtual 0x001xxxxx = -physical 0x003xxxxx. 0x002 looks at 0x000 and 0x003 looks at 0x001 -(dont mess with the 0x00000000 section, that is where our program is -running) - - mmu_section(0x00100000,0x00300000,0x0000); - mmu_section(0x00200000,0x00000000,0x0000); - mmu_section(0x00300000,0x00100000,0x0000); - -and maybe we dont need to do this but do it anyway just in case - - invalidate_tlbs(); - -read them again. - - hexstring(GET32(0x00045678)); - hexstring(GET32(0x00145678)); - hexstring(GET32(0x00245678)); - hexstring(GET32(0x00345678)); - uart_send(0x0D); uart_send(0x0A); - -the 0x000xxxxx entry was not modifed so we get 000045678 as the output -but the 0x001xxxxx read is now coming from physical 0x003xxxxx so we -get the 00345678 output, 0x002xxxxx comes from the 0x000xxxxx space -so that read gives 00045678 and the 0x003xxxxx is mapped to 0x001xxxxx -physical giving 00145678 as the output. - -So up to this point the output looks like this. - -DEADBEEF -00045678 -00145678 -00245678 -00345678 - -00045678 -00145678 -00245678 -00345678 - -00045678 -00345678 -00045678 -00145678 - -first blob is without the mmu enabled, second with the mmu but -virtual = physical, third we use the mmu to show virtual != physical -for some ranges. - -Now for some small pages, I made this function to help out. 
- -unsigned int mmu_small ( unsigned int vadd, unsigned int padd, unsigned int flags, unsigned int mmubase ) -{ - unsigned int ra; - unsigned int rb; - unsigned int rc; - - ra=vadd>>20; - rb=MMUTABLEBASE|(ra<<2); - rc=(mmubase&0xFFFFFC00)/*|(domain<<5)*/|1; - //hexstrings(rb); hexstring(rc); - PUT32(rb,rc); //first level descriptor - ra=(vadd>>12)&0xFF; - rb=(mmubase&0xFFFFFC00)|(ra<<2); - rc=(padd&0xFFFFF000)|(0xFF0)|flags|2; - //hexstrings(rb); hexstring(rc); - PUT32(rb,rc); //second level descriptor - return(0); -} - -So before turning on the mmu some physical addresses were written -with some data. The function takes the virtual, physical, flags and -where you want the secondary table to be. Remember secondary tables -can be up to 1K in size and are aligned on a 1K boundary. - - - mmu_small(0x0AA45000,0x00145000,0,0x00000400); - mmu_small(0x0BB45000,0x00245000,0,0x00000800); - mmu_small(0x0CC45000,0x00345000,0,0x00000C00); - mmu_small(0x0DD45000,0x00345000,0,0x00001000); - mmu_small(0x0DD46000,0x00146000,0,0x00001000); - //put these back - mmu_section(0x00100000,0x00100000,0x0000); - mmu_section(0x00200000,0x00200000,0x0000); - mmu_section(0x00300000,0x00300000,0x0000); - invalidate_tlbs(); - -Now why did I use different secondary table addresses most of the -time but not all of the time? A secondary table lookup is the same -first level descriptor for the top 12 bits of the address, if the -top 12 bits of the address are different it is a different secondary -table. So to demonstrate that we actually have separation within a -section I have two small pages within a 1MB section that I point -at two different physical address spaces. 
So in short if the top -12 bits of the virtual address are the same then they share the same -coarse page table, the way the function works it writes both first -and second level descriptors so if you were to do this - - mmu_small(0x0DD45000,0x00345000,0,0x00001000); - mmu_small(0x0DD46000,0x00146000,0,0x00001400); - -Then both of those virtual addresses would go to the 0x1400 table, and -the first virtual address would not have a secondary entry its -secondary entry would be in a table at 0x1000 but the first level -no longer points to 0x1000 so the mmu would get whatever it finds -in the 0x1400 table. - - -The last example is just demonstrating an access violation. Changing -the domain to that one domain we did not set full access to - - //access violation. - - mmu_section(0x00100000,0x00100000,0x0020); - invalidate_tlbs(); - - hexstring(GET32(0x00045678)); - hexstring(GET32(0x00145678)); - hexstring(GET32(0x00245678)); - hexstring(GET32(0x00345678)); - uart_send(0x0D); uart_send(0x0A); - -The first 0x45678 read comes from that first level descriptor, with -that domain - -00045678 -00000010 - -How do I know what that means with that output. Well from my blinker07 -example we touched on exceptions (interrupts). I made a generic test -fixture such that anything other than a reset prints something out -and then hangs. In no way shape or form is this a complete handler -but what it does show is that it is the exception that is at address -0x00000010 that gets hit which is data abort. So figuring out it was -a data abort (pretty much expected) have that then read the data fault -status registers, being a data access we expect the data/combined one -to show somthing and the instruction one to not. Adding that -instrumentation resulted in. - -00045678 -00000010 -00000019 -00000000 -00008110 -E5900000 -00145678 - -Now I switched to the ARM1176JZF-S Technical Reference Manual for more -detail and that shows the 0x01 was domain 1, the domain we used for -that access. 
then the 0x9 means Domain Section Fault. - -The lr during the abort shows us the instruction, which you would need -to disassemble to figure out the address, or at least that is one -way to do it perhaps there is a status register for that. - -The instruction and the address match our expectations for this fault. - -This is simply a basic intro. Just enough to be dangerous. The MMU -is one of the simplest peripherals to program so long as bit -manipulation is not something that causes you to lose sleep. What makes -it hard is that if you mess up even one bit, or forget even one thing -you can crash in spectacular ways (often silently without any way of -knowing what happened). Debugging can be hard at best. - -The ARM ARM indicates that the ARMv6 adds the feature of separating -the I and D from an mmu perspective which is an interesting thought -(see the jtag debugging comments, and think about how this can affect -you re-loading a program into ram and running) you have enough ammo -to try that. The ARMv7 doesnt seem to have a legacy mode yet, still -reading, the descriptors and how they are addresses looks basically -the same but this code doesnt yet work on the raspi 2, so I will -continue to work on that and update this repo when I figure it out. - - - - - diff --git a/mmu/README.md b/mmu/README.md new file mode 100644 index 0000000..8b66fcd --- /dev/null +++ b/mmu/README.md @@ -0,0 +1,886 @@ +# MMU on Raspberry Pi + +See the top level README file for more information on documentation +and how to run these programs. + +## Preface + +This example demonstrates ARM MMU basics. + +You will need the [ARM ARM] (ARM Architectural Reference Manual) for +ARMv5. + +This code so far does not work on the Raspberry pi 2 yet, will get +that working at some point, the knowledge here still applies, I expect +the differences to be subtle between ARMv6 and 7 but will see. 
+ +## Fundamentals + +A Memory Management Unit (MMU) translates virtual addresses into physical +addresses, as well as checking access permissions and giving control over +marking regions cacheable. This allows the programmer to identify the memory +regions which may be cached for faster access in the CPU core, but leaving out +for example hardware registers, which are mapped into memory. + +There is a boundary inside the chip around the ARM-core. The ARM-core itself +uses virtual addresses for memory and hardware-accesses, which are translated +by the MMU to physical addresses when actually requesting a value in memory. +Every access to the memory or the "world side" has to go through the MMU. + +When the ARM-core powers up the MMU is disabled, which means that every access +will pass through unmodified, making the virtual addresses (processor side) +equal to the physical addresses (world side). All of the examples thus far in +this repository (e.g. blinkers) are based on physical addresses. + +We already know that somewhere else in the chip the used addresses are +different. The Raspberry Pi manual is written for 0x7Exxxxxx based addresses, +but the ARM's physical addresses for the same things are 0x20xxxxxx for the +Raspberry Pi 1 and 0x3Fxxxxxx for the Raspberry Pi 2. For this discussion we +only care about the ARM and the ARM MMU, not for the other mystical translation +on the chip. + +### Motivation + +Let's say I am programming a program for let's say Linux. I would have to link +my program to use specific addresses (or a specific address space). Let's +assume, that our program is loaded into 0x8000 and it can use the memory from +0x0000 onwards. That would be fine for one program, but let's say another +program wants to be loaded to 0x8000 or maybe use this space as memory for +variables. So how can we run several programs without the risk of them +clobbering each other? + +The answer is neither is actually loaded into 0x8000 when running.
The programs +may assume, that they can use the addresses like stated above, but in reality +the addresses when requesting memory cells will be translated by the MMU. So +one program could be placed to 0x10008000, the other one at 0x20008000. When +program 1 thinks it accesses 0x0000abcd it is really accessing 0x1000abcd, the +other one 0x2000abcd. This translation is completely transparent to the +programs, i.e. they will never notice, that the addresses are translated for +them. + +Theoretically you could assign every virtual address a physical address to be +translated to, but that does not make much sense. The ARM-core used on the +Raspberry Pi is a 32-bit processor, i.e. it uses 32-bit addresses. This means +we have 4 Giga (2^32) addresses. A table containing the physical addresses +alone would be 16 GB big. + +The ARM has one option to translate the top 12 bits of the virtual address to +the top 12 bits of the physical address, leaving the lower 20 bits as they are +between the virtual and physical space. This means we can control 1 MB of +address space per definition and have 4096 entries in a table somewhere to +convert virtual to physical addresses. The ARM still uses all 32 Bits, 12 for +the top 12 address bits, the other ones as control flags. One of them indicates +whether a region is marked as cacheable. + +### About caching + +A cache is a (very small but) very fast memory inside the processor. It is used +by the processor transparently to remember data which is loaded and/or stored +by your program together with its address. This behaviour saves the processor +from having to request the value from RAM every time it is needed, having to +wait for the (slower) memory on every read/write. Caching can vastly increase +the speed of your program. Changes to values are written through the cache. + +But why is it disabled with the MMU disabled? Let's assume we want to read the +value register of a timer. This is done by reading from a specific address. 
+What we want is to get the current value every time we read the +register. When caching is enabled for these memory regions we would read the +current value one time, but after that we will only get the cached value. This +is no good, because you cannot control peripherals if you are unable to get the +current state or value of a peripheral, because the cache only gives you the +last (old) values. + +Likewise, if you have some RAM, which is shared by more than one resource, like +the GPU and the ARM or several processor cores on the Raspberry Pi 2 or 3, you +will have a similar situation. In general you want to disable caching on every +region which can be modified by other means than through the cache. The MMU +lets you enable or disable caching on memory regions. + +### About access permissions + +Let's think back to our example with the two programs running "at the same +time". You don't want any of the programs to get access to the operating +system's data structures nor do you want any of the two modifying code or data +of the other program. You would not want a video game to get access to your +banking account open in another window, would you? + +The mechanisms vary by processor family but the MMU provides the security +mechanisms. When a particular program is running on the processor there are +means to allow or restrict access to specific memory spaces. If some code +accesses an address it does not have the permissions for, then a Data +Abort-Exception happens, and the processor will stop running the code of that +application. The Operating System will be notified (by means of an +Exception Handler / Interrupt Service Routine). + +This Data Abort does not have to be fatal for the application, but it could be +by design. Think of a virtual machine, running on the processor and when it +tries to access its peripherals, the real Operating System can be notified to +simulate the peripheral and keep the virtual machine running.
+ +### About virtual memory + +What happens when you run out of memory on your computer? Let's say the RAM is +used up completely, but an application uses for example `malloc` to request more +memory. The operating system will then find a block of memory of another +application and save that to disk. This space can then be used by the running +application as memory. When the other program then tries to use the swapped out +memory, it will trigger a Data Abort-Exception in the processor. This will +trigger the operating system to swap that memory block back into memory (maybe +substituting another block of another application). + +The term swap comes from the idea that these blocks of ram are swapped back and +forth to disk, program A's ram goes to disk and is swapped with program T's, +then program T's is swapped with program K's and so on. This is why starting +right after you venture off that edge from real RAM to virtual, your computer's +performance drops dramatically and disk activity goes way up. The more things +run the more memory needs to be swapped onto the much slower disk. + +## Wading through the documentation + +I am mostly using the ARMv5 Architectural Reference Manual DDI0100I. ([ARM ARM]) + +Unfortunately the ARM ARM does not look the same from one to the next. With the +Raspberry Pi 1 and Zero we are technically using an ARMv6 (architecture version +6), but when we go to ARM's website, there is an ARMv5, ARMv7 and ARMv8-version, +but no ARMv6. The ARMv5 manual is actually the original ARM ARM, where they (I +assume) realized, that they could not maintain all the architecture variations +in one document forever. So they split them per revision. With respect to the +MMU the ARMv5 manual covers the ARMv4, ARMv5 and ARMv6. There is a mode where +you can have the same code and table to work on all three, so you don't have to +if-then-else your code based on whatever architecture you find. This example is +based on this legacy mode with subpages enabled.
+ +The 1 MB sections mentioned above are called sections. The ARM MMU also has +blobs with a smaller size of 4096 bytes, which are called small pages. I will +touch on those two sizes. + +As mentioned above the Raspberry Pi has a 32 bit address space. 1 MB sections +means 20 bits unaltered (bits 19 to 0) and 12 bits translated (bits 31 to 20), meaning 4096 +1 MB sections, i.e. 4096 entries in the table. The top 12 bits of the virtual +address get translated to the top 12 bits of the physical address. There are no +additional rules on the translation, you can have for example +- virtual = physical +- any combination you like +- have a bunch of virtual sections point to the same physical space. + +ARM uses the term Virtual Memory System Architecture or VMSA and +they say things like VMSAv6 to talk about the ARMv6 VMSA. There +is a section in the ARM ARM titled Virtual Memory System Architecture. +In there, in the CP15 subsection, we see the coprocessor registers; specifically CP15 register 2 is +the translation table base register. There are three opcodes which give us +access to three things: `TTBR0`, `TTBR1` and the control register. + +### Writing the Translation table base address + +First we read this comment (pg. 741, heading: Register 2: Translation table +base): +> If N = 0 always use `TTBR0`. When N = 0 (the reset case), the translation +> table base is backwards compatible with earlier versions of the +> architecture. + +So we want to leave N = 0 and use `TTBR0`. + +The `TTBR0`-register contains the base address in the physical address space. +The bits 31 down to 14-n (with n=0 in our case) are used as the base address. +Note that the MMU cannot go through the MMU to figure out how to go through the +MMU. It operates exclusively in physical address space and has direct access to +memory.
In a second we are going to see, that the base address for the MMU table +has to be aligned to 16384 bytes (2^14), i.e. the lower 14 bits of our TLB base +address are all zeroes (TLB = Translation Look-Aside Buffer). + +We write that register using + +```c + mcr p15,0,r0,c2,c0,0 ;@ tlb base +``` + +#### The co-processor + +Let me explain what that mnemonic does. `mcr` is a special instruction to write +to registers of the co-processor. This co-processor manages loads of functions +of the ARM-core, like unaligned data access or the MMU. You probably already +came across `msr` which is an instruction to store data into the status +register of the ARM-core (for example to set a new privilege mode or to enable +interrupts). You cannot access the status register or the co-processor +registers with the normal `mov` instruction. + +The co-processor has several registers, which can be accessed by the `mcr` or +the `mrc` instruction. The parameters are: + +```c + MCR{cond} P15,<Opcode_1>,<Rd>,<CRn>,<CRm>,<Opcode_2> + MRC{cond} P15,<Opcode_1>,<Rd>,<CRn>,<CRm>,<Opcode_2> +``` + +[co-processor] shows the list of registers plus their assignment to the +parameters for the two instructions. So the above statement will access the following parameters: +- Opcode_1: 0 +- Rd: r0 +- CRn: c2 (register number within CP15) +- CRm: c0 (operational register) +- Opcode_2: 0 + +r0 serves as source register of the value to be written to the Register +identified by (c2,c0,0,0). + +#### About TLB + +As far as we are concerned think of the TLB as an array of 32 bit integers, +each one being used to translate a virtual to a physical address and +to describe permissions and caching. My example is going to have a define called +`MMUTABLEBASE` which will be where our TLB table starts. The TLB is used as +cache for the page tables. + +The MMU is completely realised in hardware, but you can configure it the way +you want. It will operate on the values we set into our page table with Or and +And-operations (i.e. bit-manipulations).
It uses portions of the virtual +addresses to find the correct place in the page table to find the corresponding +physical address. From the next bits it will decide what to do next. We, as +programmers, need to know how the MMU calculates the place, so we can put our +descriptor into the correct space, so the MMU finds it. + +#### Translating virtual to physical addresses + +In the manual there is figure B4-4 (page B4-29), which shows a diagram of how +the addresses are translated. It uses X instead of N (which we want to be 0). +The modified virtual address in this diagram is, as far as we are concerned, +unmodified as we intend to use our virtual addresses. The first thing +we see, is that the lower 14 Bits of the translation table base (in my example +`MMUTABLEBASE`), i.e. the start address of the translation table are marked as +SBZ, i.e. should-be-zero. This means, that the translation table should be +aligned to 16 KiB (2^14 Bytes). Using 0x0000 as starting address would not make +much sense, as this is the place for the interrupt vector. The next good place +would be 0x00004000. Adding another 16 KiB to that address is (not a +coincidence) 0x8000, where we put our code. But any other address, which is +aligned to 16 KiB should work, as long as you have enough memory there and do not +clobber anything else. + +The figure B4-4 shows, that we take the top 12 bits of the virtual address, +multiply by 4 (or shift left by 2) and add that to the translation table base, +which gives the address of the first level descriptor for that virtual address. +A multiplication by 4 is no coincidence, but rather takes the length of each +descriptor into account (which is exactly 32 bit, or 4 byte). The descriptor +is fetched and interpreted. As long as we leave N=0 the MMU will always look +at the top 12 bits of the descriptor, which replace the top 12 bits of the virtual address. +The last two bits of the descriptor are flags, if they are 0b10, then it is a +1 MB section.
If it's something different, then a second level translation +will be triggered, but for now let's focus on the simpler part. + +##### An example + +``` + MMUTABLEBASE = 0x00004000 + virtual address = 0x12345678 + -> first 12 bit (moved to right): 0x00000123 + -> multiplied by 4: 0x0000048c + address of the descriptor for the section: 0x0000448c + let's assume the descriptor was 0xABC00002 + -> physical address 0xABC45678 +``` + +#### 16 MiB Supersections + +So the ARM ARM states, that you can have 16 MiB supersections. This would make +life easier, right? Well, no. You still have to generate 16 descriptors for +each of the possible 1 MiB sections, so you might as well make them 1 MiB big. +You can read up on the differences and try the supersections out, but I'm going +to use 1 MiB sections for now. + +Maybe you figured out a bit of a problem here. Every load and store with the MMU +enabled requires at least one MMU table lookup. The MMU memory accesses of +course don't have to go through the MMU, but every other store or load does. This +does have a performance hit. Therefore the MMU caches the most recent table lookups in the TLB. +This helps but the conversion has to be done on every requested address. + +### Descriptor format + +I am looking at the subsection about First Level Descriptors in the ARM ARM, +especially the Table B4-1 (pg. B4-27, First-level descriptor format (VMSAv6, +subpages enabled)). + +This table identifies four different sets of last two bits [1:0]: +- `0b00` - this section is unmapped. Attempting to access these addresses will generate a translation fault (Data Abort). The bits [31:2] are ignored by the hardware, although it is recommended to keep valid permissions for the descriptor there.
+- `0b01` - for coarse second level table, second level lookup required for translation; allows more fine grained sectioning of the section +- `0b10` - section descriptor for its associated virtual addresses, no second level lookup +- `0b11` - reserved in VMSAv6 + +For now let's work with the `0b10`-entries. The format of the entry is as follows: + +| | Bits 31:20 | Bits 19:15 | 14,13,12 | 11,10 | 9 | 8:5 | 4 | 3 | 2 | 1,0 | +|---|------------|------------|----------|-------|---|-----|---|---|---|-----| +| Section | Section base address | SBZ | TEX | AP | IMP | Domain | SBZ | C | B | `10` | + +The **section base address** are the 12 top bits of the physical address, which +substitute the top 12 bits of the virtual address. The **C** bit marks the address +region as cacheable. We absolutely do not want to cache peripheral regions, +RAM-regions maybe. The C-flag simply asserts bits on the AMBA/AXI (memory) bus +that mark the transaction as cacheable, you still need a cache setup and +enabled for the transaction to actually get cached. If you don't have the +cache for that transaction type enabled, then it just does a normal memory (or +peripheral) operation. + +The **B** bit means bufferable, as in write buffer. This enables a "cache" +but for writing instead of reading. When writing a value to RAM (or peripheral) +everything is known, the data and the address. The buffer-bit allows some logic +at this level to accept the value and address and continue to write the data to +the slower RAM or peripheral (or cache) and lets the CPU go on executing its +program. This may give us a performance boost. When a second write appears and +we only have a single place for a transaction, the processor gets stalled until +the first one is complete and the second write-command can be saved to the +buffer. The advantage is that for a number of writes the processor can hand the +needed data to the memory controller and carry on. + +You need to look up the **TEX** bits yourself.
I will stick to them being 0b000 +and will not mess with any fancy features here. The combinations of TEX, C and +B bits make some subtle differences, look them up in Table B4-3 (CB + TEX +Encodings). + +The **AP** bits indicate the level of access permissions (see Table B4-1 MMU +access permissions, pg. B4-9). For page table formats which don't support APX, +an APX value of 0 is assumed. The following AP-values are therefore valid: +- `0b00` - No access for anyone; will generate permission fault on every access +- `0b01` - Read/Write permission for privileged mode +- `0b10` - Read/write for privileged mode, read for user mode (writes in user mode trigger permission faults) +- `0b11` - Full access (R/W for everyone) + +The **domain** is a bit trickier to explain. There is a register right next to the translation table base address register which contains 16 different domain specifications. These definitions are 2 bits long each: +- `0b00` - **no access**, any access generates a domain fault +- `0b01` - **client**, accesses are checked against the access permission bits in the TLB entry +- `0b10` - **UNPREDICTABLE** behaviour +- `0b11` - **manager**, accesses are not checked and cannot generate a permission fault + +The domains basically are 16 different definitions which control the behaviour +on access. We can define for example 16 types of applications and assign them +sections. We assign sections to domains by setting the four **domain** bits of +the translation table entry to the number of the definition in the register. +By changing two bits in this register we can then put all sections of a domain +into another permission mode, which is quite useful, because we don't need to +change the MMU table. + +For starters we are going to set all of the domains to `0b11` (manager, don't check) and +all of our sections can have the domain number 0.
+ +## A simple implementation + +```c + /** + * \brief creates an translation table entry (for sections of size 1 MiB) + * \param[in] virtual the virtual address (only top 12 bits used) + * \param[in] physical the physical address (only top 12 bits used) + * \param[in] flags the flags for the section + **/ + uint32_t mmu_section ( uint32_t virtual, uint32_t physical, uint32_t flags ) + { + uint32_t offset = virtual >> 20; + // plus and or are the same thing here, as MMUTABLEBASE is 14 bit aligned + uint32_t* entry = MMUTABLEBASE | (offset<<2); + + // mask lower 20 bits of physical address then ORR flags and 0x02 for 1 MiB + uint32_t physval = (physical & 0xfff00000) | (flags & 0x7ffc) | 0x02; + + *entry = physval; + return(0); + } + + #define CACHEABLE 0x08 + #define BUFFERABLE 0x04 +``` + +### Filling the table with sections + +Before enabling the MMU itself we need to make sure, that every section of +memory we want to use is defined with a valid entry in the table. If not, +access to that region will trigger a fault handler - if you decide to write +one - which in turn can access non-mapped memory, which is not good. + +The smallest amount of RAM on a Raspberry Pi is 256 MiB or 0x10000000 bytes. +Our program enters at address 0x8000, so that is within the first +section 0x000xxxxx so we should make that section cacheable and +bufferable. + +```c + mmu_section( 0x00000000,0x00000000, CACHEABLE | BUFFERABLE ); +``` + +This statement will create an entry mapping the virtual address space 0x000xxxxx +to the physical addresses 0x000xxxxx and enable the cache and write buffer. If we +want to use all 256 MiB we would need to do this for all the sections from +0x000xxxxx to 0x0FFxxxxx. + + + +We know that for the Raspberry Pi 1 the peripherals, like AUX / UART and such +are in ARM physical space at 0x20xxxxxx. To allow for more RAM on the Raspberry +Pi 2 they needed to relocate that peripheral base address and moved it to 0x3Fxxxxxx.
+We can either create 16 1 MiB section entries to cover the whole range of +peripherals or we only define the sections we care to talk to. The UART and the +GPIO are associated with the 0x202xxxxx space. There are a couple of timers +in the 0x200xxxxx space so one entry can cover those. + +``` c + mmu_section(0x20000000,0x20000000,0x0000); //NOT CACHED! + mmu_section(0x20200000,0x20200000,0x0000); //NOT CACHED! + mmu_section(0x3F000000,0x3F000000,0x0000); //NOT CACHED! + mmu_section(0x3F200000,0x3F200000,0x0000); //NOT CACHED! +``` + +These sections are not cached and not buffered, but we may play with that +to demonstrate what caching a peripheral can do to you, and why we need to turn on +the MMU if for no other reason than to get some bare metal performance by +using the cache. + +Now you have to think on a system level here, there are a number +of things in play. We need to plan our memory space: where are we +putting the MMU table, where are our peripherals, where is our program. + +If the only reason for using the MMU is to allow the use of the cache +then just map the whole world 1:1; if you want with the peripherals not cached +and the rest cached. + +### Cache invalidation + +So once our tables are set up then we need to actually turn the +MMU on. When initialising a cache you want to clean out all the entries in a +safe way. You want to invalidate everything, mark every cache line as empty / +available. Likewise you want the TLB caching the MMU does to be invalidated, +so the MMU starts up with no cached translations that don't match our +entries. Also we want the CPU to do a Data Synchronization Barrier (DSB), so +every explicit memory transaction is finished before the next instruction begins. + +All of the above can be done using the C15 [co-processor]. So, to summarise: +1. Invalidate all caches (Instruction and data, write 0 to `0, c7, c7, 0`) +2. Invalidate the TLB entries (write 0 to `0, c8, c7, 0`) +3. 
Data synchronisation barrier (write 0 to `0, c7, c10, 4`) +4. Set the domain access controls (write 0xffffffff to `0, c3, c0, 0`, 0b11 for every domain) +5. Set the base address for the translation table (`0,c2,c0,0`) +6. Enable level 1 caches and the MMU in the control register (`0,c1,c0,0`) and some other useful things: + - bit 0 (M) enables MMU + - bit 2 (C) enables level 1 data cache + - bit 11 (Z) enables branch prediction + - bit 12 (I) enables instruction cache + - bit 22 (U) enables non-aligned data access as well as mixed big-/little-endian data access + +Which of these bits you want to set is up to you. I would recommend M, C and I, +but I am going to go ahead and set them all. So a simple example implementing +the MMU-enabling process might be this: + +```c + .global mmu_init + mmu_init: + mov r1,#0 + // invalidate caches + mcr p15,0,r1,c7,c7,0 + // invalidate TLB entries + mcr p15,0,r1,c8,c7,0 + // data synchronisation barrier + mcr p15,0,r1,c7,c10,4 + + // set all domains to 0b11 + ldr r1, =0xffffffff + mcr p15,0,r1,c3,c0,0 + + // set the translation table base address (remember to align 16 KiB!) + mcr p15,0,r0,c2,c0,0 + + // set the bits mentioned above + ldr r1, =0x00401805 + mrc p15,0,r2,c1,c0,0 + orr r2,r2,r1 + mcr p15,0,r2,c1,c0,0 + + mov pc, lr +``` + +For messing with the translation tables after the MMU is started, you will need +to invalidate the TLB cache again, so let's put this part into its own function. +We don't need to care about the L1 cache, this time. Also ARMv6 introduces a +feature to help with invalidating the TLB, but I'm going with this solution: + +```c + .globl tlb_invalidate + tlb_invalidate: + mov r2,#0 + // invalidate TLB entries + mcr p15,0,r2,c8,c7,0 + // data synchronisation barrier + mcr p15,0,r2,c7,c10,4 + mov pc,lr +``` + +#### (JTAG) Debugging and caching + +Something to note here. Debugging using the JTAG based on-chip-debugger +makes life easier. No SD-card swapping and no more EEPROM-flashing. 
BUT, +it is not completely without issue. The basic problem is that caches often +separate instruction fetches from data reads and writes. Let's say you have executed +instructions at 0xD0000 (which are now cached) and an instruction at 0xC000. Then you +transfer a new build of your program, set the start address and run again. + +For each instruction in the program the prior instruction at that address might +still be in the instruction cache and the new one in main RAM (or data cache). +So, when running the new program you might still be running the old +instructions, which are fetched back from the instruction cache, not the RAM +(unless an invalidate or flush happens). + +There is also the question of whether the instruction and data caches are shared. +They can be specific to the core and your setup. Is your JTAG-debugger able to +disable the caches, has it done that for you, or can you do it manually? + +Any time you are using the instruction or data caches you need to be careful +using a JTAG-debugger or even a bootloader type approach depending on its +design as you might end up doing data writes of instructions and going +around the instruction cache or worse. This may be done by your JTAG debugger, +but keep in mind to change back to / from Raspberry Pi 2 when switching between +the Pis. Otherwise this might drive you mad, when you keep downloading new +code but the Pi crashes or behaves unexpectedly. + +## Having fun with address translation + +So the example is going to start with the MMU off and write to +addresses in four different 1 MiB address spaces, so we can play with the section +descriptors and demonstrate virtual to physical address conversion later. 
+ +```c + // write data to four different 1 MiB sections + PUT32(0x00045678,0x00045678); + PUT32(0x00145678,0x00145678); + PUT32(0x00245678,0x00245678); + PUT32(0x00345678,0x00345678); + + // write the data back to UART + hexstring(GET32(0x00045678)); + hexstring(GET32(0x00145678)); + hexstring(GET32(0x00245678)); + hexstring(GET32(0x00345678)); + // 0D CR '\r' (carriage ret) + uart_send(0x0D); + // 0A LF '\n' (new line) + uart_send(0x0A); + + // Then setup the MMU with at least those four sections + mmu_section(0x00000000,0x00000000,CACHEABLE | BUFFERABLE); + mmu_section(0x00100000,0x00100000,0x0000); + mmu_section(0x00200000,0x00200000,0x0000); + mmu_section(0x00300000,0x00300000,0x0000); + + // and the peripherals: + mmu_section(0x20000000,0x20000000,0x0000); //NOT CACHED! + mmu_section(0x20200000,0x20200000,0x0000); //NOT CACHED! + + // Start the MMU with the instruction and data caches enabled: + mmu_init ( MMUTABLEBASE ); + + // when we read those four addresses back we get the same output + // as we wrote before because we mapped virtual = physical + hexstring(GET32(0x00045678)); + hexstring(GET32(0x00145678)); + hexstring(GET32(0x00245678)); + hexstring(GET32(0x00345678)); + uart_send(0x0D); uart_send(0x0A); +``` + +But what if we swizzle things around? Don't mess with the 0x00000000-section, +because that is where our code is. + +```c + // change the table entries + mmu_section(0x00100000,0x00300000,0x0000); + mmu_section(0x00200000,0x00000000,0x0000); + mmu_section(0x00300000,0x00100000,0x0000); + + // invalidate the TLB + invalidate_tlbs(); + + // and read the addresses again, which we wrote to above + hexstring(GET32(0x00045678)); + hexstring(GET32(0x00145678)); + hexstring(GET32(0x00245678)); + hexstring(GET32(0x00345678)); + uart_send(0x0D); uart_send(0x0A); +``` + +The 0x000xxxxx entry was not modified, so we get 0x00045678 as the output. 
The +section 0x001xxxxx will read from physical addresses 0x003xxxxx so we get the +0x00345678 output, 0x002xxxxx will translate to the 0x000xxxxx space +so that read gives 0x00045678 and the 0x003xxxxx is mapped to physical +0x001xxxxx giving 0x00145678 as the output. + +So up to this point the output looks like this: + +``` + 00045678 + 00145678 + 00245678 + 00345678 + + 00045678 + 00145678 + 00245678 + 00345678 + + 00045678 + 00345678 + 00045678 + 00145678 +``` + +The first block is with the MMU disabled, the second one with MMU enabled but +1:1 virtual to physical translation, the third one with the non 1:1 translation. + +## Coarse paging + +With coarse paging the logic does not know what kind of translation will be done +until the first level read, so the first step is identical to the translation +above. If it sees a `0b01` as the lower 2 bits of the first level descriptor, +then it knows it's a coarse page entry and it needs a second level fetch. +Table B4-5 Accessing coarse page table second-level descriptors (pg. B4-30) +shows the logic fetching the second level descriptor. + +### Second level descriptor format + +There are two things to the two level translation. At first we need to set the +first level descriptors accordingly. The format follows Table B4-1 +(pg. B4-27, First-level descriptor format (VMSAv6, subpages enabled)). + +| | Bits 31:10 | 9 | 8:5 | 4,3,2 | 1,0 | +|---|------------|---|-----|-------|-----| +| Coarse Page Table | Coarse Page Table base Address | IMP | Domain | IMP | `01` | + +The bits 4:2 are implementation defined and should be zero (SBZ) for VMSAv6. +The domain-bits are used as above. The bits 31:10 are used as the base address +of the second level page table. This second level page table needs to be +aligned to 1 KiB in memory. + +So after fetching the first level descriptor the bits 31:10 of the entry will +be used as bits 31:10 of the second level descriptor address, i.e. 
it will be used as +base address for the second level table. The virtual address bits 19:12 will be +used to navigate inside that second level table, i.e. shifted completely to the +right, multiplied by 4 and then added onto the base address. The lowest 2 bits +of the address of the second level entry are always zero. Note that there are +256 possibilities to fill the bits 19:12 of the virtual address, i.e. the +section is divided into 256 parts, i.e. 4 KiB pages. + +The second level descriptor looks like this (for small pages, for more see +Table 4-3 Second level descriptor format (subpages enabled), pg. 4-31): + +| | Bits 31:12 | 11,10 | 9,8 | 7,6 | 5,4 | 3 | 2 | 1,0 | +|---|------------|-------|-----|-----|-----|---|---|-----| +| Small page | Small page base address | AP3 | AP2 | AP1 | AP0 | C | B | `10` | + +Note that there are four **AP**-fields here. The small page is divided further +into four blocks of the same size (i.e. in the case of small pages 1 KiB), which +have their own AP-Access control. AP0 applies to the block with the lowest base +address. You can set them all to `0b11` - full access and not care, or have +fine grained access control over the blocks of that page. + +### An example + +This example is a bit crazy, as the address of the second level descriptor is +an address where we don't even have RAM anymore on the Raspberry Pi. Normally +you want to keep your second level tables somewhere near the first level table, +so you have the memory management information in a confined space. 
+ +``` + MMUTABLEBASE = 0x00004000 + virtual address = 0x12345678 + -> bits 19:12 of the virtual address: 0x00000045 + address of the first level descriptor: 0x0000448c + let's assume a descriptor value of: 0xABCDE001 + base address of second level table: 0xABCDE000 + offset to the second level table: 0x00000114 + address of the second level descriptor: 0xABCDE114 +``` + +### A simple implementation + +```c + /** + * \brief creates an translation table entry (for sections of size 1 MiB) + * \param[in] virtual the virtual address (only top 12 bits used) + * \param[in] physical the physical address (only top 12 bits used) + * \param[in] flags the flags for the section + **/ + uint32_t mmu_page ( uint32_t virtual, uint32_t physical, uint32_t flags, uint32_t secondbase ) + { + uint32_t offset = virtual >> 20; + // plus and or are the same thing here, as MMUTABLEBASE is 14 bit aligned + uint32_t* entry = MMUTABLEBASE | (offset<<2); + // mask lower 20 bits of physical address then ORR flags and 0x01 for coarse translation + uint32_t entryval = (secondbase & 0xfffffc00) | (flags & 0xf0) | 0x01; + + // set first level descriptor + *entry = entryval; + + // mask everything except bits 19:12 + offset = (virtual >> 12) & 0xff; + // form the second level + uint32_t* secondLevelEntry = (secondbase & 0xfffffc00) | (offset << 2); + + // form the value of the second level descriptor + // bytes 31:12 are the page base address, flags contain B,C, AP_x = 0b11 + // for all and the 0x02 at the end to identify the entry as small page + uint32_t physval = (physical & 0xfffff000) | 0xff0 | (flags & 0xc) | 0x02; + + // set the second level descriptor + *secondLevelEntry = physval; + return(0); + } +``` + +So let's assign some sections to coarse translation: + +```c + mmu_small(0x0AA45000,0x00145000,0,0x00000400); + mmu_small(0x0BB45000,0x00245000,0,0x00000800); + mmu_small(0x0CC45000,0x00345000,0,0x00000C00); + mmu_small(0x0DD45000,0x00345000,0,0x00001000); + 
mmu_small(0x0DD46000,0x00146000,0,0x00001000); + + invalidate_tlb(); +``` + +Let's look at the last two `mmu_small`-statements here. You will notice that +the `secondbase`-parameter is the same here. This is in fact wanted, as I want +to add an entry into the secondary table I assigned before, not set a new one, +i.e. orphaning the old one. So let's assume I would set a new secondary table +base address like this: + +```c + mmu_small(0x0DD45000,0x00345000,0,0x00001000); + mmu_small(0x0DD46000,0x00146000,0,0x00001400); +``` + +When I try to access an address in the page 0x0DD45xxx, then the MMU would look +inside a secondary table located at 0x00001400, which of course does not contain +our previously set entry for the small page. But it will definitely find +something there, and probably behave unexpectedly, if we are not aware of our +mistake here. So always make sure the secondary table base addresses of the +pages in the same section are the same. + +## Access violation + +First we want to set a domain to `0b00` (no access), so accessing a section with that domain +will definitely trigger an access violation. I will assume we wrote that to +domain number 1. + +```c + // set the domain of a section to 0x01 + mmu_section(0x00100000,0x00100000,0x0020); + invalidate_tlb(); + + // then read the data from the sections + hexstring(GET32(0x00045678)); + hexstring(GET32(0x00145678)); + hexstring(GET32(0x00245678)); + hexstring(GET32(0x00345678)); + uart_send(0x0D); uart_send(0x0A); +``` + +I expect that second read-statement to trigger a Data Abort-Exception, so I +want to write an exception handler to read the status information of that +exception. We need the following registers of the C15 co-processor: +- `0,c5,c0,0` - Data Fault Status Register +- `0,c5,c0,1` - Instruction Fault Status Register + +### Data Fault Status Register + +This register holds the source of the last data fault. 
The bits have the +following functions: + +| Bit 31:13 | 12 | 11 | 10 | 9 | 8 | 7:4 | 3:0 | +|-----|----------|----|----|---|---|-----|-----| +| SBZ | SD | RW | S | 0 | 0 | Domain | Status | + +**SD** indicates an AXI Decode or Slave error caused the abort (only valid for +external aborts, for all other should be zero). **RW** indicates whether a read +(0) or a write (1) access caused the abort. The **S**-flag is part of the status +field. The **Domain** bits indicate the domain which was accessed when the abort +occurred. The **Status** bits show the type of fault generated. See the +[Data Fault Status Register]-manual for a list. + +### Instruction Fault Status Register + +This register holds the source of the last instruction fault. The bits have the following functions: + +| Bit 31:13 | 12 | 11 | 10 | 9 :4 | 3:0 | +|-----|----------|----|----|------|-----| +| SBZ | SD | SBZ | 0 | SBZ | Status | + +See the [Instruction Fault Status Register]-manual for the list of status combinations. + +### Reading the status registers + +```c + data_abort: + // save the link-register + mov r6,lr + // get the last executed instruction + ldr r8,[r6,#-8] + + // reading the status register + mrc p15,0,r4,c5,c0,0 ;@ data/combined + mrc p15,0,r5,c5,c0,1 ;@ instruction + mov sp,#0x00004000 + + // print data fault status register + mov r0,r4 + bl hexstring + + // print instruction fault status register + mov r0,r5 + bl hexstring + + // print the link register + mov r0,r6 + bl hexstring + + // print the bit-representation of the last executed instruction + mov r0,r8 + bl hexstring + + b hang +``` + +Running the code results in: + +``` + 00045678 + 00000019 + 00000000 + 00008104 + E5900000 +``` + +The first line is the one correct data-read we do in our code above. The +next value is the data fault status register, which indicates that the domain +0x01 was accessed and aborted with an 0x09 fault, i.e. a Domain Section fault. 
+The third line is the value of the instruction fault status register, which +indicates a status of 0x0, i.e. "no function, reset value". That probably means +no fault happened. + +The fourth line is the link register, which on a data abort holds the address of +the faulting instruction plus 8 (here 0x8104), and the last line is the binary representation +of the instruction which caused the fault, read from lr-8. Use a disassembler to view the +instruction in all its mnemonic glory: + +``` + 80fc: e5900000 ldr r0, [r0] +``` + +# Conclusion + +This is just a simple intro to MMUs, just enough to be dangerous. The MMU is one +of the simplest peripherals to program so long as bit manipulations are not +something that causes you to lose sleep. But if you mess it up even a bit, or +forget something, you can crash in spectacular ways (often silently without any +way of knowing what really happened). Debugging can be hard at best. + +The ARM ARM indicates that ARMv6 adds a feature of separating the data from the +instructions from the MMU's perspective, which is an interesting thought (see the +JTAG-debugging comments). 
+ +[ARM ARM]: https://www.scss.tcd.ie/~waldroj/3d1/arm_arm.pdf +[co-processor]: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0301h/ch03s02s01.html +[Data Fault Status Register]: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0301h/Bgbiaghh.html +[Instruction Fault Status Register]: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0301h/Bgbccfgi.html diff --git a/mmu/coarse_translation.ps b/mmu/coarse_translation.ps deleted file mode 100644 index 234265d..0000000 Binary files a/mmu/coarse_translation.ps and /dev/null differ diff --git a/mmu/notmain.c b/mmu/notmain.c index a33719a..46fd6c2 100644 --- a/mmu/notmain.c +++ b/mmu/notmain.c @@ -1,15 +1,13 @@ - -//------------------------------------------------------------------------- -//------------------------------------------------------------------------- +#include extern void PUT32 ( unsigned int, unsigned int ); extern void PUT16 ( unsigned int, unsigned int ); extern unsigned int GET32 ( unsigned int ); -extern void start_mmu ( unsigned int, unsigned int ); +extern void mmu_init ( uint32_t ); +extern void mmu_domain ( uint32_t ); extern void stop_mmu ( void ); -extern void invalidate_tlbs ( void ); -extern void invalidate_caches ( void ); +extern void invalidate_tlb ( void ); extern void uart_init ( void ); extern void uart_send ( unsigned int ); @@ -17,43 +15,64 @@ extern void uart_send ( unsigned int ); extern void hexstrings ( unsigned int ); extern void hexstring ( unsigned int ); -unsigned int system_timer_low ( void ); - #define MMUTABLEBASE 0x00004000 -//------------------------------------------------------------------- -unsigned int mmu_section ( unsigned int vadd, unsigned int padd, unsigned int flags ) +#define CACHEABLE 0x08 +#define BUFFERABLE 0x04 + +/** + * \brief creates an translation table entry (for sections of size 1 MiB) + * \param[in] virtual the virtual address (only top 12 bits used) + * \param[in] physical the physical address (only top 12 
bits used) + * \param[in] flags the flags for the section + **/ +uint32_t mmu_section ( uint32_t virtual, uint32_t physical, uint32_t flags ) { - unsigned int ra; - unsigned int rb; - unsigned int rc; - - ra=vadd>>20; - rb=MMUTABLEBASE|(ra<<2); - rc=(padd&0xFFF00000)|0xC00|flags|2; - //hexstrings(rb); hexstring(rc); - PUT32(rb,rc); + uint32_t offset = virtual >> 20; + // plus and or are the same thing here, as MMUTABLEBASE is 14 bit aligned + uint32_t* entry = (uint32_t*) (MMUTABLEBASE | (offset<<2)); + + // mask lower 20 bits of physical address then ORR flags and 0x02 for 1 MiB + uint32_t physval = (physical & 0xfff00000) | (flags & 0x7ffa) | 0x02; + + *entry = physval; return(0); } -//------------------------------------------------------------------- -unsigned int mmu_small ( unsigned int vadd, unsigned int padd, unsigned int flags, unsigned int mmubase ) + + +/** + * \brief creates an translation table entry (for sections of size 1 MiB) + * \param[in] virtual the virtual address (only top 12 bits used) + * \param[in] physical the physical address (only top 12 bits used) + * \param[in] flags the flags for the section + **/ +uint32_t mmu_page ( uint32_t virtual, uint32_t physical, uint32_t flags, uint32_t secondbase ) { - unsigned int ra; - unsigned int rb; - unsigned int rc; - - ra=vadd>>20; - rb=MMUTABLEBASE|(ra<<2); - rc=(mmubase&0xFFFFFC00)/*|(domain<<5)*/|1; - //hexstrings(rb); hexstring(rc); - PUT32(rb,rc); //first level descriptor - ra=(vadd>>12)&0xFF; - rb=(mmubase&0xFFFFFC00)|(ra<<2); - rc=(padd&0xFFFFF000)|(0xFF0)|flags|2; - //hexstrings(rb); hexstring(rc); - PUT32(rb,rc); //second level descriptor + uint32_t offset = virtual >> 20; + // plus and or are the same thing here, as MMUTABLEBASE is 14 bit aligned + uint32_t* entry = (uint32_t*) (MMUTABLEBASE | (offset<<2)); + // mask lower 20 bits of physical address then ORR flags and 0x01 for coarse translation + uint32_t entryval = (secondbase & 0xfffffc00) | (flags & 0xf0) | 0x01; + + // set first level 
descriptor + *entry = entryval; + + // mask everything except bits 19:12 + offset = (virtual >> 12) & 0xff; + // form the second level + uint32_t* secondLevelEntry = (uint32_t*) ((secondbase & 0xfffffc00) | (offset << 2)); + + // form the value of the second level descriptor + // bytes 31:12 are the page base address, flags contain B,C, AP_x = 0b11 + // for all and the 0x02 at the end to identify the entry as small page + uint32_t physval = (physical & 0xfffff000) | 0xff0 | (flags & 0xc) | 0x02; + + // set the second level descriptor + *secondLevelEntry = physval; return(0); } + + //------------------------------------------------------------------------ int notmain ( void ) { @@ -95,7 +114,7 @@ int notmain ( void ) mmu_section(0x20000000,0x20000000,0x0000); //NOT CACHED! mmu_section(0x20200000,0x20200000,0x0000); //NOT CACHED! - start_mmu(MMUTABLEBASE,0x00000001|0x1000|0x0004); //[23]=0 subpages enabled = legacy ARMv4,v5 and v6 + mmu_init( MMUTABLEBASE ); hexstring(GET32(0x00045678)); hexstring(GET32(0x00145678)); @@ -106,7 +125,7 @@ int notmain ( void ) mmu_section(0x00100000,0x00300000,0x0000); mmu_section(0x00200000,0x00000000,0x0000); mmu_section(0x00300000,0x00100000,0x0000); - invalidate_tlbs(); + invalidate_tlb(); hexstring(GET32(0x00045678)); hexstring(GET32(0x00145678)); @@ -114,16 +133,16 @@ int notmain ( void ) hexstring(GET32(0x00345678)); uart_send(0x0D); uart_send(0x0A); - mmu_small(0x0AA45000,0x00145000,0,0x00000400); - mmu_small(0x0BB45000,0x00245000,0,0x00000800); - mmu_small(0x0CC45000,0x00345000,0,0x00000C00); - mmu_small(0x0DD45000,0x00345000,0,0x00001000); - mmu_small(0x0DD46000,0x00146000,0,0x00001000); + mmu_page(0x0AA45000,0x00145000,0,0x00000400); + mmu_page(0x0BB45000,0x00245000,0,0x00000800); + mmu_page(0x0CC45000,0x00345000,0,0x00000C00); + mmu_page(0x0DD45000,0x00345000,0,0x00001000); + mmu_page(0x0DD46000,0x00146000,0,0x00001000); //put these back mmu_section(0x00100000,0x00100000,0x0000); mmu_section(0x00200000,0x00200000,0x0000); 
mmu_section(0x00300000,0x00300000,0x0000); - invalidate_tlbs(); + invalidate_tlb(); hexstring(GET32(0x0AA45678)); hexstring(GET32(0x0BB45678)); @@ -137,9 +156,9 @@ int notmain ( void ) uart_send(0x0D); uart_send(0x0A); //access violation. - + mmu_domain ( 0xffffff03 ); mmu_section(0x00100000,0x00100000,0x0020); - invalidate_tlbs(); + invalidate_tlb(); hexstring(GET32(0x00045678)); hexstring(GET32(0x00145678)); diff --git a/mmu/novectors.s b/mmu/novectors.s index deaf533..a1e36fd 100644 --- a/mmu/novectors.s +++ b/mmu/novectors.s @@ -74,22 +74,32 @@ handler: b hang data_abort: + // save the link-register mov r6,lr + // get the last executed instruction ldr r8,[r6,#-8] + + // reading the status register mrc p15,0,r4,c5,c0,0 ;@ data/combined mrc p15,0,r5,c5,c0,1 ;@ instruction mov sp,#0x00004000 - bl hexstring + + // print data fault status register mov r0,r4 bl hexstring + + // print instruction fault status register mov r0,r5 bl hexstring + + // print the link register mov r0,r6 bl hexstring + + // print the bit-representation of the last executed instruction mov r0,r8 bl hexstring - mov r0,r7 - bl hexstring + b hang .globl PUT32 @@ -106,25 +116,35 @@ GET32: dummy: bx lr -.globl start_mmu -start_mmu: - mov r2,#0 - mcr p15,0,r2,c7,c7,0 ;@ invalidate caches - mcr p15,0,r2,c8,c7,0 ;@ invalidate tlb - mcr p15,0,r2,c7,c10,4 ;@ DSB ?? - - mvn r2,#0 - bic r2,#0xC - mcr p15,0,r2,c3,c0,0 ;@ domain - +.global mmu_domain +mmu_domain: + mcr p15, 0, r0, c3, c0, 0 + mov pc, lr + +.global mmu_init +mmu_init: + mov r1,#0 + // invalidate caches + mcr p15,0,r1,c7,c7,0 + // invalidate TLB entries + mcr p15,0,r1,c8,c7,0 + // data synchronisation barrier + mcr p15,0,r1,c7,c10,4 + + // set all domains to 0b11 + ldr r1, =0xffffffff + mcr p15,0,r1,c3,c0,0 + + // set the translation table base address (remember to align 16 KiB!) 
mcr p15,0,r0,c2,c0,0 ;@ tlb base - mcr p15,0,r0,c2,c0,1 ;@ tlb base - + + // set the bits mentioned above + ldr r1, =0x00401805 mrc p15,0,r2,c1,c0,0 orr r2,r2,r1 mcr p15,0,r2,c1,c0,0 - - bx lr + + mov pc, lr .globl stop_mmu stop_mmu: @@ -135,8 +155,8 @@ stop_mmu: mcr p15,0,r2,c1,c0,0 bx lr -.globl invalidate_tlbs -invalidate_tlbs: +.globl invalidate_tlb +invalidate_tlb: mov r2,#0 mcr p15,0,r2,c8,c7,0 ;@ invalidate tlb mcr p15,0,r2,c7,c10,4 ;@ DSB ?? diff --git a/mmu/periph.c b/mmu/periph.c index 2e0d49b..a2489a4 100644 --- a/mmu/periph.c +++ b/mmu/periph.c @@ -6,7 +6,6 @@ extern void PUT32 ( unsigned int, unsigned int ); extern void PUT16 ( unsigned int, unsigned int ); extern void PUT8 ( unsigned int, unsigned int ); extern unsigned int GET32 ( unsigned int ); -extern void BRANCHTO ( unsigned int ); extern void dummy ( unsigned int ); #define SYSTIMERCLO (0x20003004) @@ -120,13 +119,7 @@ void uart_init ( void ) PUT32(GPPUDCLK0,0); PUT32(AUX_MU_CNTL_REG,3); } -//------------------------------------------------------------------------- -unsigned int system_timer_low ( void ) -{ - return(GET32(SYSTIMERCLO)); -} -//------------------------------------------------------------------------- -//------------------------------------------------------------------------- + //------------------------------------------------------------------------- diff --git a/mmu/section_translation.ps b/mmu/section_translation.ps deleted file mode 100644 index d81ebdc..0000000 Binary files a/mmu/section_translation.ps and /dev/null differ