diff --git a/mmu/Makefile b/mmu/Makefile
index 537efe6..ce2dc2a 100644
--- a/mmu/Makefile
+++ b/mmu/Makefile
@@ -30,9 +30,3 @@ notmain.hex : memmap novectors.o periph.o notmain.o
 	$(ARMGNU)-objdump -D notmain.elf > notmain.list
 	$(ARMGNU)-objcopy notmain.elf -O ihex notmain.hex
 
-
-
-
-
-
-
diff --git a/mmu/README b/mmu/README
deleted file mode 100644
index 5924ff0..0000000
--- a/mmu/README
+++ /dev/null
@@ -1,863 +0,0 @@
-
-See the top level README file for more information on documentation
-and how to run these programs.
-
-This example demonstrates ARM MMU basics.
-
-You will need the ARM ARM (ARM Architectural Reference Manual) for
-ARMv5.  I have a couple of pages included in this repo, but you still
-will need the ARM ARM.
-
-This code so far does not work on the Raspberry pi 2 yet, will get
-that working at some point, the knowledge here still applies, I expect
-the differences to be subtle between ARMv6 and 7 but will see.
-
-
-
--- NEED TO RE-WRITE THIS AGAIN, SUBPAGES ENABLED, COARSE 1KB TABLES  --
-
-
-
-
-So what an MMU does or at least what an MMU does for us is it
-translates virtual addresses into physical addresses as well as
-checking access permissions, and gives us control over cachable
-regions.
-
-So what does all of that mean?
-
-There is a boundary inside the chip around the ARM core, part of that
-boundary is the memory interface for the ARM for lack of a better term
-how the ARM accesses the world.  Nothing special, all processors have
-some sort of address and data based interface between the processor and
-the ram and peripherals.  That boundary uses physical addresses, that
-boundary is on the memory side or "world side" of the ARM's mmu.
-Within the ARM core there is the "processor side" of the mmu, and all
-load and store (and fetch) accesses to the world go through the mmu.
-
-When the ARM powers up the mmu is disabled, which means all accesses
-pass through unmodified making the "processor side" or virtual address
-space equal to the world side physical address space.  All of my
-examples thus far, blinkers and such are based on physical addresses.
-We already know that elsewhere in the chip is another address
-translation of some sort, because the manual is written for 0x7Exxxxxx
-based addresses, but the ARM's physical addresses for those same things
-are 0x20xxxxxx for the raspi 1 and 0x3Fxxxxxx for the raspi 2.  For this
-discussion we do not care about that other mystery address translation,
-we only care about the ARM and the ARM mmu.
-
-So when I say the mmu translates virtual addresses into physical
-addresses.  What that means is on the processor side there is an address
-you are accessing, but that does not have to be the same address on
-the physical address side of the mmu.  Lets say for example I am
-running a program on an operating system, Linux lets say, and I need
-to compile that program before I can use it and I need to link it for
-an address space so lets say that I link it to enter at address 0x8000
-and use memory from 0x0000 to whatever I need and/or whatever is
-available.  So that is all fine, except what if I have two programs
-and I want both running "at the same time" how can both use the same
-address space without clobbering each other?  The answer is neither is
-at that address space the virtual address WHEN RUNNING one of them is
-in the virtual address space 0x00000000 to some number, but in reality
-program 1 might have that mapped to the physical address 0x01000000 and
-program 2 might have its 0x00000000 to some number mapped to 0x02000000.
-So when program 1 thinks it is writing to address 0xABCDE it is really
-writing to 0x010ABCDE and when program 2 thinks it is writing to
-address 0xABCDE it is really writing to 0x020ABCDE.
-
-If you think about it it doesnt make any sense to allow any virtual
-address to map to any physical address, for example from 0x12345678
-to 0xAABBCCDD.  Think about it, we are talking about a 32 bit address
-space or 4Giga addresses.  If we allowed any address to convert to
-any other address we would need a 4Giga to 4Giga map, we would actually
-need 16Gigabytes just to hold the 4Giga physical addresses worst case.
-To cut to the chase ARM has one option where the top 12 bits of the
-virtual get translated to 12 bits of physical, the lower 20 bits in
-that case are the same between the virtual and physical.  This means
-we can control 1MByte of address space with one definition, and have
-4096 entries in some table somewhere to convert from virtual to
-physical.  That is quite manageable.  The minimum we would need to
-store are the 12 replacement bits per table entry, but ARM uses a full
-32 bit entry, which for this 1MB flavor, has the 12 physical bits plus
-some other control bits.
-
-What does cachable regions mean?  The mmu also gives you the feature
-of being able to choose per descriptor whether or not you want to
-enable caching on that block.  One obvious reason would be for the
-peripherals.  Think about a timer, ideally you read the current timer
-tick and each time you read it you get the current timer tick and
-as it changes you see it change.  But what if when we turned on the
-data cache it covered all addresses, all loads and stores?  Then you
-read the timer once, get a value, read it again, now you get the
-cached value over and over again you dont see the real timer value
-in the peripheral.  That is not good, you cannot manage a peripheral
-if you cannot read its status register or read the data coming out
-of it, etc.  So at a minimum your peripherals need to be in non-cached
-blocks.  Likewise, if you have some ram that is shared by more than
-one resource, say the GPU and the ARM or for the raspberry pi 2 shared
-between multiple ARM cores, you have a similar situation, another
-resource may change the ram on the far side of your cache but your
-cache assumes it has a copy of what is in ram.  Basically a cache
-only helps you if whatever on the far side of it is only modified by
-writes through the cache, if there are ways to change the data on
-the far side you should not cache that area.   The mmu gives you
-the ability to control cached and non-cached spaces.
-
-What is meant by access permissions?  Lets think about those two
-programs running "at the same time" on some operating system (Linux
-for example) you dont want to allow one program to gain access to
-the operating systems data nor some other programs data.  Sure, some
-operating systems are meant for only running trusted and
-well mannered programs.  But you dont want some video game on your
-home computer to have access to your banking account data in another
-window/program?  The mechanisms vary across processor families but
-an important job for the mmu is to provide a protection mechanism.
-Such that when a particular program has a time slice on the processor
-there is some mechanism to allow or restrict memory spaces.  If some
-code accesses an address that it does not have permission for then
-an abort happens and the processor is notified.  An interesting
-side effect of this is that this doesnt have to be fatal, in fact it
-could be by design.  Think of a virtual machine, you could let the
-virtual machine software run on the processor, and when it accesses
-one of its peripherals the real operating system gets an abort but
-instead of killing the virtual machine it actually simulates the
-peripheral and lets the virtual machine keep running.  Another one
-that you have probably run into is when you run out of ram in your
-computer, the notion of virtual memory which is different than virtual
-address space.  Virtual memory in this case is when your program
-ventures off the end of its allowed address space into ram it thinks
-it has.  The operating system gets an abort, finds some ram from
-some other program, swaps that ram to disk for example, then allows
-the program that was running to have a little more ram by mapping it
-back in and allowing it to run.  Later when the program whose data
-got swapped to disk needs it it swaps back and whatever was in the
-ram it swaps with then goes to disk.  The term swap comes from the
-idea that these blocks of ram are swapped back and forth to disk,
-program A's ram goes to disk and is swapped with program T's, then
-program T's is swapped with program K's and so on.  This is why
-starting right after you venture off that edge from real ram to
-virtual, your computers performance drops dramatically and disk
-activity goes way up, the more things running the more swapping going
-on and disk is significantly slower than ram.
-
-As with all baremetal programming, wading through documentation is
-the bulk of the job.  Definitely true here, with the unfortunate
-problem that ARM's docs dont all look the same from one Architectural
-Reference Manual to another.  We have this other problem that we
-are technically using an ARMv6 (architecture version 6)(for the raspi 1)
-but when you go to ARM's website there is an ARMv5 and then ARMv7 and
-ARMv8, but no ARMv6.  Well the ARMv5 manual is actually the original
-ARM ARM, that I assume they realized couldnt maintain all the
-architecture variations forever in one document, so they perhaps
-wisely went to one ARM ARM per rev.  With respect to the MMU, the ARMv5
-reference manual covers the ARMv4 (I didnt know there was an mmu option
-there) ARMv5 and ARMv6, and there is a mode such that you can have the
-same code/tables and it works on all three, meaning you dont have to
-if-then-else your code based on whatever architecture you find.  This
-raspi 1 example is based on subpages enabled which is this legacy or
-compatibility mode across the three.
-
-I am mostly using the ARMv5 Architectural Reference Manual.
-ARM DDI0100I.
-
-The 1MB sections mentioned above are called...sections...The ARM
-mmu also has blobs that are smaller sizes 4096 byte pages for
-example, will touch on those two sizes.  The 4096 byte one is called
-a small page.
-
-As mentioned above, 32 bit address space, 1MB is 20 bits so 32-20 is
-12 bits or 4096 possible combinations or the address space is broken
-up into 4096 1MB sections.  The top 12 bits of the virtual address
-get translated to 12 bits of physical.  No rules on the translation
-you can have virtual = physical or have any combination, or have
-a bunch of virtual sections point at the same physical space, whatever
-you want/need.
-
-ARM uses the term Virtual Memory System Architecture or VMSA and
-they say things like VMSAv6 to talk about the ARMv6 VMSA.  There
-is a section in the ARM ARM titled Virtual Memory System Architecture.
-In there we see the coprocessor registers, specifically CP15 register
-2 is the translation table base register.
-
-
-So the ARMv5 ARM ARM (ARM Architectural Reference Manual) is what
-we need now.  See the top level README for finding this document,
-I have included a few pages in the form of postscript, any decent pdf
-viewer should be able to handle these files.  Before the pictures
-though, the section in question is titled Virtual Memory System
-Architecture.  In the CP15 subsection register 2 is the translation
-table base register.  There are three opcodes which give us access to
-three things, TTBR0, TTBR1 and the control register.  
-
-First we read this comment
-
-If N = 0 always use TTBR0. When N = 0 (the reset case), the translation
-table base is backwards compatible with earlier versions of the
-architecture.
-
-That is the one we want, we will leave that as N = 0 and not touch it
-and use TTBR0
-
-Now what the TTBR0 description initially is telling me that bit 31
-down to 14-n or 14 in our case since n = 0 is the base address, in
-PHYSICAL address space.  Note the mmu cannot possibly go through the
-mmu to figure out how to go through the mmu, the mmu itself only
-operates on physical space and has direct access to it.  In a second
-we are going to see that we need the base address for the mmu table
-to be aligned to 16384 bytes.  (2 to the power 14, the lower 14 bits
-of our TLB base address needs to be all zeros).
-
-We write that register using
-
-    mcr p15,0,r0,c2,c0,0 ;@ tlb base
-
-TLB = Translation Lookaside Buffer.  As far as we are concerned think
-of it as an array of 32 bit integers, each integer (descriptor) being
-used to completely or partially convert from virtual to physical and
-describe permissions and caching.
-
-My example is going to have a define called MMUTABLEBASE which will
-be where we start our TLB table.
-
-Here is the reality of the world.  Some folks struggle with bit
-manipulation, orring and anding and shifting and such, some dont.  The
-MMU is logic so it operates on these tables in the way that logic would,
-meaning from a programmers perspective it is a lot of bit manipulation
-but otherwise is relatively simple to something a program could do.  As
-programmers we need to know how the logic uses portions of the virtual
-address to look into this descriptor table or TLB, and then extracts
-from those bits the next thing it needs to do.  We have to know this so
-that for a particular virtual address we can place the descriptor we
-want in the place where the hardware is going to find it.  So we need
-a few lines of code plus some basic understanding of what is going on.
-Just like bit manipulation causes some folks to struggle, reading
-a chapter like this mmu chapter is equally daunting.  It is nice to
-have someone hold your hand through it.  Hopefully I am doing more
-good than bad in that respect.
-
-There is a file, section_translation.ps in this repo, you should be
-able to use a pdf viewer to open this file.  The figure on the
-second page shows just the address translation from virtual to physical
-for a 1MB section.  This picture uses X instead of N, we are using an
-N = 0 so that means X = 0.   The translation table base at the top
-of the diagram is our MMUTABLEBASE, the address in physical space
-of the beginning of our first level TLB or descriptor table.  The
-first thing we need to do is find the table entry for the virtual
-address in question (the Modified virtual address in this diagram,
-as far as we are concerned it is unmodified it is the virtual
-address we intend to use).  The first thing we see is the lower
-14 bits of the translation table base are SBZ = should be zero.
-Basically we need to have the translation table base aligned on a
-16Kbyte boundary (2 to the 14th is 16K).  It would not make sense
-to use all zeros as the translation table base, we have our reset
-and interrupt vectors at and near address zero in the arms address
-space so the first sane address would be 0x00004000.  The first
-level descriptor is based on the top 12 bits of the virtual address
-or 4096 entries, that is 16KBytes (not a coincidence), 0x4000 + 0x4000
-is 0x8000, where our arm programs entry point is, so we have space
-there if we want to use it.  But any address with the lower 14 bits
-being zero will work so long as you have enough memory at that address
-and you are not clobbering anything else that is using that memory
-space.
-
-So what this picture is showing us is that we take the top 12 bits
-of the virtual address, multiply by 4 or shift left 2, and add that
-to the translation table base, this gives the address for the first
-level descriptor for that virtual address.  The diagram shows the
-first level fetch which returns a 32 bit value that we have placed
-in the table.  If the lower 2 bits of that first level descriptor are
-0b10 then this is a 1MB Section.  If a 1MB section then the top 12
-bits of the first level descriptor replace the top 12 bits of the
-virtual address to convert it into a physical address.  Understand
-here first and foremost so long as we do the N = 0 thing, the first
-level descriptor or the first thing the mmu does is look at the top
-12 bits of the virtual address, always.  If the lower two bits of
-the first level descriptor are not 0b10 then we get into
-a second level descriptor and more virtual bits come into play, but
-for now if we start by learning just 1MB sections, the conversion
-from virtual to physical only cares about the top 12 bits of the
-address.  So for 1MB sections we dont have to concentrate on every
-actual address we are going to access we only need to think about
-the 1MB aligned ranges.  The uart for example on the raspi 1 has
-a number of registers that start with 0x202150xx, if we use a 1MB
-section for those we only care about the 0x202xxxxx part of the
-address.  To not have to change our code we would want to have
-the virtual = physical for that and do not mark it as cacheable.
-
-So if my MMUTABLEBASE was 0x00004000 and I had a virtual address of
-0x12345678 then the hardware is going to take the top 12 bits of that
-address 0x123, multiply by 4 and add that to the MMUTABLEBASE.
-0x4000+(0x123<<2) = 0x448C.  and that is the address the mmu is going
-to use for the first-level lookup.  Ignoring the other bits in the
-descriptor for now, if the first-level descriptor has the value
-0xABC00002, the lower two bits are 0b10, a 1MB section, so the top
-12 bits replace the virtual addresses top 12 bits and our 0x12345678
-is converted to the physical address 0xABC45678.
-
-
-Now they have this optional thing called a supersection which is a 16MB
-sized thing rather than 1MB and one might think that that would make
-life easier, right?  Wrong.  No matter what, assuming the N = 0 thing
-the first level descriptor is found using the top 12 bits of the
-virtual address, so in order to do some 16MB thing you need 16 entries
-one for each of the possible 1MB sections.  If you are already
-generating 16 descriptors might as well just make them 1MB sections,
-you can read up on the differences between super sections and sections
-and try them if you want.  For what I am doing here dont need them,
-just wanted to point out you still need 16 entries per super section.
-
-Hopefully I have not lost you yet with this address manipulation,
-and maybe you are one step ahead of me, yes EVERY load and store with
-the mmu enabled requires at least one mmu table lookup, the mmu when it
-accesses this memory does not go through itself, but EVERY other fetch
-and load and store.  Which does have a performance hit, they do have
-a bit of a cache in the mmu to store the last so many tlb lookups.
-That helps, but you cannot avoid the mmu having to do the conversion
-on every address.
-
-In the ARM ARM I am looking at the subsection on first-level descriptors
-has a table:
-Table B4-1 First-level descriptor format (VMSAv6, subpages enabled)
-What this is telling us is that if the first-level descriptor, the
-32 bit number we place in the right place in the TLB, has the lower
-two bits 0b10 then that entry defines a 1MB section and the mmu can get
-everything it needs from that first level descriptor.  But if the
-lower two bits are 0b01 then this is a coarse page table entry and
-we have to go to a second level descriptor to complete the
-conversion from virtual to physical.  Not every address will need
-this, only the address ranges we want to be more finely divided than
-1MB.  Or the other way of saying it is if we want to control an
-address range in chunks smaller than 1MB then we need to use pages
-not sections.  You can certainly use pages for the whole world, but
-if you do the math, 4096Byte pages would mean your mmu table needs
-to be 4MB+16K worst case.  And you have to do more work to set that
-all up.
-
-The coarse_translation.ps file I have included in this repo starts
-off the same way as a section, has to, the logic doesnt know what
-you want until it sees the first level descriptor.  If it sees a
-0b01 as the lower 2 bits of the first level descriptor then this is
-a coarse page table entry and it needs to do a second level fetch.
-The second level fetch does not use the mmu tlb table base address,
-instead bits 31:10 of the first level descriptor plus bits 19:12 of the
-virtual address (times 4) are where the second level descriptor lives.
-Note that is 8 more bits so the section is divided into 256 parts, this
-page table address is similar to the mmu table address, but it needs
-to be aligned on a 1K boundary (lower 10 bits zeros) and can be worst
-case 1KBytes in size.
-
-The second level descriptor format defined in the ARM ARM (small pages
-are most interesting here, subpages enabled) is a little different
-than a first level section, we had a domain in the first level
-descriptor to get here, but now have direct access to four sets of
-AP bits you/I would have to read more to know what the difference
-is between the domain defined AP and these additional four, for now
-I dont care this is bare metal, set them to full access (0b11) and
-move on (see below about domain and ap bits).
-
-So lets take the virtual address 0x12345678 and the MMUTABLEBASE of
-0x4000 again.  The first level descriptor address is the top 12
-bits of the virtual address 0x123, times 4, added to the MMUTABLEBASE
-0x448C.  But this time when we look it up we find a value in the
-table that has the lower two bits being 0b01.  Just to be crazy lets
-say that descriptor was 0xABCDE001  (ignoring the domain and other
-bits just talking address right now).  That means we take 0xABCDE000
-the picture shows bits 19:12 (0x45) of the virtual address (0x12345678)
-so the address to the second level descriptor in this crazy case is
-0xABCDE000+(0x45<<2) = 0xABCDE114  why is that crazy?  because I
-chose an address where we in theory dont have ram on the raspberry pi
-maybe a mirrored address space, but a sane address would have been
-somewhere close to the MMUTABLEBASE so we can keep the whole of the
-mmu tables in a confined area.  Used this address simply for
-demonstration purposes not based on a workable solution.
-
-The "other" bits in the descriptors are the TEX bits, the C and B
-bits, domain and AP.
-
-The C bit is the simplest one to start with that means Cacheable.  For
-peripherals we absolutely dont want them to be cached.  For ram, maybe.
-
-The b bit, means bufferable, as in write buffer.  Something you may
-not have heard about or thought about ever.  It is kind of like a cache
-on the write end of things instead of read end.   I digress, when
-a processor writes something everything is known, the address and
-data.  So the next level of logic, could, if so designed, accept
-that address and data at that level and release the processor to
-keep doing what it was doing (ideally fetch some more instructions
-and keep running) in parallel that logic could then continue to perform
-the write to the slower peripheral or really slow dram (or faster cache).
-Giving us a small to large performance gain.  But, what happens if while
-we are doing that first write another write happens.  Well if we only
-have storage for one transaction in this little feature then the
-processor has to wait for us to finish the first write however long
-that takes, then we can grab the information for the second write and
-then release the processor.  I call writes "fire and forget" because
-ideally the processor hands off the info to the memory controller
-and keeps going, the memory controller has all the info it needs to
-complete the task.  For a read the processor needs that data back so
-basically has to wait.  Well a write buffer can store up to some number
-of addresses and data.  It can still fill up and have to hold the
-processor off.  But it is similar to what a cache is to reading, it has
-some faster ram that stages writes so the processor, sometimes, can
-keep on going.
-
-Now the TEX bits you just have to look up and there is the rub there
-are likely more than one set of tables for TEX C and B, I am going
-to stick with a TEX of 0b000 and not mess with any fancy features
-there.  Now depending on whether this is considered an older arm
-(ARMv5) or an ARMv6 or newer the combination of TEX, C and B have
-some subtle differences.  The cache bit in particular does enable
-or disable this space as cacheable.  That simply asserts bits on
-the AMBA/AXI (memory) bus that marks the transaction as cacheable,
-you still need a cache and need it setup and enabled for the
-transaction to actually get cached.  If you dont have the cache for
-that transaction type enabled then it just does a normal memory (or
-peripheral) operation.  So we set TEX to zeros to keep it out of the
-way.
-
-Lastly the domain and AP bits.  Now you will see a 4 bit domain thing
-and a 2 bit domain thing.  These are related.  There is a register in
-the MMU right next to the translation table base address register this
-one is a 32 bit register that contains 16 different domain definitions.
-
-The two bit domain controls are defined as such (these are AP bits)
-
-0b00 No access Any access generates a domain fault
-0b01 Client Accesses are checked against the access permission bits in the TLB entry
-0b10 Reserved Using this value has UNPREDICTABLE results
-0b11 Manager Accesses are not checked against the access permission bits in the TLB
-entry, so a permission fault cannot be generated
-
-For starters we are going to set all of the domains to 0b11 dont check
-cant fault.  What are these 16 domains though?  Notice it takes 4 bits
-to describe one of 16 things.  The different domains have no specific
-meaning other than that we can have 16 different definitions that we
-control for whatever reason.  You might allow for 16 different
-threads running at once in your operating system, or 16 different
-types of software running (kernel, application, ...) you can mark
-a bunch of sections as belonging to one particular domain, and with a
-simple change to that domain control register, a whole domain might
-go from one type of permission to another, from no checking to
-no access for example.  By just writing this domain register you can
-quickly change what address spaces have permission and which ones dont
-without necessarily changing the mmu table.
-
-Since I usually use the MMU in bare metal to enable data caching on ram
-I set my domain controls to 0b11, no checking and I simply make all
-the MMU sections domain number 0.
-
-So we end up with this simple function that allows us to add first level
-descriptors in the MMU translation table.
-
-unsigned int mmu_section ( unsigned int vadd, unsigned int padd, unsigned int flags ) //write one first level 1MB section descriptor mapping vadd to padd
-{ //flags are ORed into the descriptor as given (e.g. 8 = C bit, 4 = B bit)
-    unsigned int ra; //scratch: top 12 bits of vadd, then of padd
-    unsigned int rb; //address of the descriptor within the table
-    unsigned int rc; //descriptor value to store
-
-    ra=vadd>>20; //1MB section number of the virtual address
-    rb=MMUTABLEBASE|(ra<<2); //4 bytes per entry; the OR only acts as an add if MMUTABLEBASE has its low 14 bits zero
-    ra=padd>>20; //drop the low 20 bits of the physical address
-    rc=(ra<<20)|flags|2; //section base | flags | 0b10 (section descriptor type)
-    PUT32(rb,rc); //presumably a 32 bit store of rc to address rb; PUT32 is defined elsewhere
-    return(0); //always returns 0
-}
-
-So what you have to do to turn on the MMU is to first figure out all
-the memory you are going to access, and make sure you have entries
-for that.  This is important, if you forget something, and dont have
-a valid entry there, then you fault, your fault handler, if you have
-chosen to write it, may also fault if it isnt placed right or something
-it accesses also faults...(I would assume the fault handler is also
-behind the mmu but would have to read up on that).
-
-So the smallest amount of ram on a raspi is 256MB or 0x10000000 bytes.
-
-Our program enters at address 0x8000, so that is within the first
-section 0x000xxxxx so we should make that section cacheable and
-bufferable.
-
-    mmu_section(0x00000000,0x00000000,0x0000|8|4);
-
-This is saying map the virtual 0x000xxxxx to the physical 0x000xxxxx
-enable the cache and write buffer. 0x8 is the C bit and 0x4 is the B
-bit.  tex, domain, etc are zeros.
-
-If we want to use all 256mb we would need to do this for all the
-sections from 0x000xxxxx to 0x0FFxxxxx.  Maybe do that later.
-
-We know that for the raspi1 the peripherals, uart and such are in
-arm physical space at 0x20xxxxxx.  To allow for more ram on the raspi 2
-they needed to move that and moved it to 0x3Fxxxxxx.  So we either need
-16 1MB section sized entries to cover that whole range or we look at
-specific sections for specific things we care to talk to and just add
-those.  The uart and the gpio it is associated with is in the 0x202xxxxx
-space.  There are a couple of timers in the 0x200xxxxx space so one
-entry can cover those.
-
-if we didnt want to allow those to be cached or write buffered then
-
-    mmu_section(0x20000000,0x20000000,0x0000); //NOT CACHED!
-    mmu_section(0x20200000,0x20200000,0x0000); //NOT CACHED!
-    mmu_section(0x3F000000,0x3F000000,0x0000); //NOT CACHED!
-    mmu_section(0x3F200000,0x3F200000,0x0000); //NOT CACHED!
-
-but we may play with that to demonstrate what caching a peripheral
-can do to you, why we need to turn on the mmu if for no other reason
-than to get some bare metal performance by using the d cache.
-
-Now you have to think on a system level here, there are a number
-of things in play.  We need to plan our memory space, where are we
-putting the MMU table, where are our peripherals, where is our program.
-
-If the only reason for using the mmu is to allow the use of the d cache
-then just map the whole world virtual = physical if you want with the
-peripherals not cached and the rest cached.
-
-If you are on the raspi 2 with multiple arm cores and are using
-the multiple arm cores you need to do more reading if you want one
-core to talk to another by sharing some of the memory between
-them.  Same problem as peripherals basically with multiple masters
-of the ram/peripheral on the far side of my cache, how do I ensure
-what is in my cache matches the far side?  Easiest way is to not
-cache that space.  You need to read up on if the cores share a cache
-or have their own (or if l2 if present is shared but l1 is not),
-ldrex/strex were implemented specifically for multi core, but you
-need to understand the cache effects on these instructions (<grin>
-not documented well, I have an example on just this one topic).
-
-So once our tables are setup then we need to actually turn the
-MMU on.  Now I cant figure out where I got this from, and I have
-modified it in this repo.  According to this manual it was with the
-ARMv6 that we got the DSB feature which says wait for either cache
-or MMU to finish something before continuing.  In particular when
-initializing a cache to start it up you want to clean out all the
-entries in a safe way you dont want to evict them and hose memory
-you want to invalidate everything, mark it such that the cache lines
-are empty/available.  Likewise that little bit of TLB caching the MMU
-has, we want to invalidate that too so we dont start up the mmu
-with entries in there that dont match our entries.
-
-Why are we invalidating the cache in mmu init code?  Because first we
-need the mmu to use the d cache (to protect the peripherals from
-being cached) and second the controls that enable the mmu are in the
-same register as the i and d controls so it made sense to do both
-mmu and cache stuff in one function.
-
-So after the DSB we set our domain control bits, now in this example
-I have done something different, 15 of the 16 domains have the 0b11
-setting which is dont fault on anything, manager mode.  I set domain
-1 such that it has no access, so in the example I will change one
-of the descriptor table entries to use domain one, then I will access
-it and then see the access violation.  I am also programming both
-translation table base addresses even though we are using the N = 0
-mode and only one is needed.  Depends on which manual you read I guess
-as to whether or not you see the N = 0 and the separate or shared
-i and d mmu tables.  (the reason for two is if you want your i and
-d address spaces to be managed separately).
-
-Understand I have been running on ARMv6 systems without the DSB and it
-just works, so maybe that is dumb luck...
-
-This code relies on the caller to pass in the MMU enable and I and D
-cache enables.  This is because this is derived from code where
-sometimes I turn things on or dont turn things on and wanted it
-generic.
-
-
-.globl start_MMU ;@ start_MMU(r0 = translation table base, r1 = control register bits to set)
-start_MMU: ;@ invalidates caches and tlb, sets domains and table base, then ORs r1 into the cp15 control register; r2 is scratch
-    mov r2,#0 ;@ value written by the invalidate/barrier operations below
-    mcr p15,0,r2,c7,c7,0 ;@ invalidate caches
-    mcr p15,0,r2,c8,c7,0 ;@ invalidate tlb
-    mcr p15,0,r2,c7,c10,4 ;@ DSB ?? intended to wait for the invalidates to complete before continuing
-
-    mvn r2,#0 ;@ r2 = 0xFFFFFFFF, every 2 bit domain field is 0b11 (manager)
-    bic r2,#0xC ;@ clear bits 3:2 so domain 1 becomes 0b00 (no access)
-    mcr p15,0,r2,c3,c0,0 ;@ domain
-
-    mcr p15,0,r0,c2,c0,0 ;@ tlb base (TTBR0)
-    mcr p15,0,r0,c2,c0,1 ;@ tlb base (TTBR1, same value; only TTBR0 is needed when N = 0)
-
-    mrc p15,0,r2,c1,c0,0 ;@ read the control register
-    orr r2,r2,r1 ;@ set the caller supplied enable bits, no bits are cleared
-    mcr p15,0,r2,c1,c0,0 ;@ write the control register back
-
-    bx lr
-
-I am going to mess with the translation tables after the MMU is started
-so the easiest way to deal with the TLB cache is to invalidate it, but
-dont need to mess with main L1 cache.  ARMv6 introduces a feature to
-help with this, but going with this solution.
-
-.globl invalidate_tlbs
-invalidate_tlbs:
-    mov r2,#0
-    mcr p15,0,r2,c8,c7,0  ;@ invalidate tlb
-    mcr p15,0,r2,c7,c10,4 ;@ DSB ??
-    bx lr
-
-Something to note here.  Debugging using the JTAG based on chip debugger
-makes life easier, that removing sd cards or the old days pulling an
-eeprom out and putting it it in an eraser then a programmer.  BUT,
-it is not completely without issue.  When and where and if you hit this
-depends heavily on the core you are using and the jtag tools and the
-commands you remember/prefer.  The basic problem is caches can and
-often do separate instruction I fetches from data D reads and writes.
-So if you have test run A of a program that has executed the instruction
-at address 0xD000.  So that instruction is in the I cache.  You have
-also executed the instruction at 0xC000 but it has been evicted, but
-you dont actually know what is in the I cache or not, shouldnt even
-try to assume.  You stop the processor, you write a new program to
-memory, now these are data D writes, and go through the D cache.  Then
-you set the start address and run again.  Now there are a number of
-combinations here and only one if them works, the rest can lead to
-failure.
-
-For each instruction/address in the program, if the prior instruction
-at that address was in the i cache, and since data writes do not go
-through the i cache then the new instruction for that address is either
-in the d cache or in main ram.  When you run the new program you will
-get the stale/old instruction from a prior run when you fetch that
-address (unless an invalidate happens, if a flush happens then you
-write back, but why would an I cache flush?), and if the new instruction
-at that address is not the same as the old one unpredictable results
-will occur.  You can start to see the combinations, did the data
-write go through to d cache or to ram, will it flush to ram and is the
-i cache invalid for that address, etc.
-
-There is also the quesiton of are the I and D caches shared, they can
-be but that is both specific to the core and your setup.  Also does
-the jtag debugger have the ability to disable the caches, has it done
-it for you, can you do it manually.
-
-Any time you are using the i or d caches you need to be careful using
-a jtag debugger or even a bootloader type approach depending on its
-design as you might end up doing data writes of instructions and going
-around the i cache or worse.  So for this kind of work using a chip
-reset and non volitle rom/flash based bootloader can/will save you
-a lot of headaches.  If you know your debugger is solving this for you,
-great, but always make sure as you change from the raspi 2 back to
-a raspi 1 for example it might not be doing it and it will drive you
-nuts when you keep downloading a new program and it either crashes
-in a strange way or simply just keeps running the old program and
-not appearing to take your new changes.
-
-So the example is going to start with the mmu off and write to
-addresses in four different 1MB address spaces.  So that later we
-can play with the section descriptors and demonstrate virtual to
-physical address conversion.
-
-So write some stuff and print it out on the uart.
-
-    PUT32(0x00045678,0x00045678);
-    PUT32(0x00145678,0x00145678);
-    PUT32(0x00245678,0x00245678);
-    PUT32(0x00345678,0x00345678);
-
-    hexstring(GET32(0x00045678));
-    hexstring(GET32(0x00145678));
-    hexstring(GET32(0x00245678));
-    hexstring(GET32(0x00345678));
-    uart_send(0x0D); uart_send(0x0A);
-
-then setup the mmu with at least those four sections and the peripherals
-
-    mmu_section(0x00000000,0x00000000,0x0000|8|4);
-    mmu_section(0x00100000,0x00100000,0x0000);
-    mmu_section(0x00200000,0x00200000,0x0000);
-    mmu_section(0x00300000,0x00300000,0x0000);
-    //peripherals
-    mmu_section(0x20000000,0x20000000,0x0000); //NOT CACHED!
-    mmu_section(0x20200000,0x20200000,0x0000); //NOT CACHED!
-
-and start the mmu with the I and D caches enabled
-
-    start_mmu(MMUTABLEBASE,0x00000001|0x1000|0x0004);
-
-then if we read those four addresses again we get the same output
-as before since we maped virtual = physical.
-
-    hexstring(GET32(0x00045678));
-    hexstring(GET32(0x00145678));
-    hexstring(GET32(0x00245678));
-    hexstring(GET32(0x00345678));
-    uart_send(0x0D); uart_send(0x0A);
-
-but what if we swizzle things around.  make virtual 0x001xxxxx =
-physical 0x003xxxxx.  0x002 looks at 0x000 and 0x003 looks at 0x001
-(dont mess with the 0x00000000 section, that is where our program is
-running)
-
-    mmu_section(0x00100000,0x00300000,0x0000);
-    mmu_section(0x00200000,0x00000000,0x0000);
-    mmu_section(0x00300000,0x00100000,0x0000);
-
-and maybe we dont need to do this but do it anyway just in case
-
-    invalidate_tlbs();
-
-read them again.
-
-    hexstring(GET32(0x00045678));
-    hexstring(GET32(0x00145678));
-    hexstring(GET32(0x00245678));
-    hexstring(GET32(0x00345678));
-    uart_send(0x0D); uart_send(0x0A);
-
-the 0x000xxxxx entry was not modifed so we get 000045678 as the output
-but the 0x001xxxxx read is now coming from physical 0x003xxxxx so we
-get the 00345678 output, 0x002xxxxx comes from the 0x000xxxxx space
-so that read gives 00045678 and the 0x003xxxxx is mapped to 0x001xxxxx
-physical giving 00145678 as the output.
-
-So up to this point the output looks like this.
-
-DEADBEEF
-00045678
-00145678
-00245678
-00345678
-
-00045678
-00145678
-00245678
-00345678
-
-00045678
-00345678
-00045678
-00145678
-
-first blob is without the mmu enabled, second with the mmu but
-virtual = physical, third we use the mmu to show virtual != physical
-for some ranges.
-
-Now for some small pages, I made this function to help out.
-
-unsigned int mmu_small ( unsigned int vadd, unsigned int padd, unsigned int flags, unsigned int mmubase )
-{
-    unsigned int ra;
-    unsigned int rb;
-    unsigned int rc;
-
-    ra=vadd>>20;
-    rb=MMUTABLEBASE|(ra<<2);
-    rc=(mmubase&0xFFFFFC00)/*|(domain<<5)*/|1;
-    //hexstrings(rb); hexstring(rc);
-    PUT32(rb,rc); //first level descriptor
-    ra=(vadd>>12)&0xFF;
-    rb=(mmubase&0xFFFFFC00)|(ra<<2);
-    rc=(padd&0xFFFFF000)|(0xFF0)|flags|2;
-    //hexstrings(rb); hexstring(rc);
-    PUT32(rb,rc); //second level descriptor
-    return(0);
-}
-
-So before turning on the mmu some physical addresses were written
-with some data.  The function takes the virtual, physical, flags and
-where you want the secondary table to be.  Remember secondary tables
-can be up to 1K in size and are aligned on a 1K boundary.
-
-
-    mmu_small(0x0AA45000,0x00145000,0,0x00000400);
-    mmu_small(0x0BB45000,0x00245000,0,0x00000800);
-    mmu_small(0x0CC45000,0x00345000,0,0x00000C00);
-    mmu_small(0x0DD45000,0x00345000,0,0x00001000);
-    mmu_small(0x0DD46000,0x00146000,0,0x00001000);
-    //put these back
-    mmu_section(0x00100000,0x00100000,0x0000);
-    mmu_section(0x00200000,0x00200000,0x0000);
-    mmu_section(0x00300000,0x00300000,0x0000);
-    invalidate_tlbs();
-
-Now why did I use different secondary table addresses most of the
-time but not all of the time?  A secondary table lookup is the same
-first level descriptor for the top 12 bits of the address, if the
-top 12 bits of the address are different it is a different secondary
-table.  So to demonstrate that we actually have separation within a
-section I have two small pages within a 1MB section that I point
-at two different physical address spaces.  So in short if the top
-12 bits of the virtual address are the same then they share the same
-coarse page table, the way the function works it writes both first
-and second level descriptors so if you were to do this
-
-    mmu_small(0x0DD45000,0x00345000,0,0x00001000);
-    mmu_small(0x0DD46000,0x00146000,0,0x00001400);
-
-Then both of those virtual addresses would go to the 0x1400 table, and
-the first virtual address would not have a secondary entry its
-secondary entry would be in a table at 0x1000 but the first level
-no longer points to 0x1000 so the mmu would get whatever it finds
-in the 0x1400 table.    
-
-
-The last example is just demonstrating an access violation.  Changing
-the domain to that one domain we did not set full access to
-
-    //access violation.
-
-    mmu_section(0x00100000,0x00100000,0x0020);
-    invalidate_tlbs();
-
-    hexstring(GET32(0x00045678));
-    hexstring(GET32(0x00145678));
-    hexstring(GET32(0x00245678));
-    hexstring(GET32(0x00345678));
-    uart_send(0x0D); uart_send(0x0A);
-
-The first 0x45678 read comes from that first level descriptor, with
-that domain
-
-00045678
-00000010
-
-How do I know what that means with that output.  Well from my blinker07
-example we touched on exceptions (interrupts).  I made a generic test
-fixture such that anything other than a reset prints something out
-and then hangs.   In no way shape or form is this a complete handler
-but what it does show is that it is the exception that is at address
-0x00000010 that gets hit which is data abort.  So figuring out it was
-a data abort (pretty much expected) have that then read the data fault
-status registers, being a data access we expect the data/combined one
-to show somthing and the instruction one to not.  Adding that
-instrumentation resulted in.
-
-00045678
-00000010
-00000019
-00000000
-00008110
-E5900000
-00145678
-
-Now I switched to the ARM1176JZF-S Technical Reference Manual for more
-detail and that shows the 0x01 was domain 1, the domain we used for
-that access. then the 0x9 means Domain Section Fault.
-
-The lr during the abort shows us the instruction, which you would need
-to disassemble to figure out the address, or at least that is one
-way to do it perhaps there is a status register for that.
-
-The instruction and the address match our expectations for this fault.
-
-This is simply a basic intro.  Just enough to be dangerous.  The MMU
-is one of the simplest peripherals to program so long as bit
-manipulation is not something that causes you to lose sleep.  What makes
-it hard is that if you mess up even one bit, or forget even one thing
-you can crash in spectacular ways (often silently without any way of
-knowing what happened).  Debugging can be hard at best.
-
-The ARM ARM indicates that the ARMv6 adds the feature of separating
-the I and D from an mmu perspective which is an interesting thought
-(see the jtag debugging comments, and think about how this can affect
-you re-loading a program into ram and running) you have enough ammo
-to try that.  The ARMv7 doesnt seem to have a legacy mode yet, still
-reading, the descriptors and how they are addresses looks basically
-the same but this code doesnt yet work on the raspi 2, so I will
-continue to work on that and update this repo when I figure it out.
-
-
-
-
-
diff --git a/mmu/README.md b/mmu/README.md
new file mode 100644
index 0000000..8b66fcd
--- /dev/null
+++ b/mmu/README.md
@@ -0,0 +1,886 @@
+# MMU on Raspberry Pi
+
+See the top level README file for more information on documentation
+and how to run these programs.
+
+## Preface
+
+This example demonstrates ARM MMU basics.
+
+You will need the [ARM ARM] (ARM Architectural Reference Manual) for
+ARMv5.
+
+This code so far does not work on the Raspberry pi 2 yet, will get
+that working at some point, the knowledge here still applies, I expect
+the differences to be subtle between ARMv6 and 7 but will see.
+
+## Fundamentals
+
+A Memory Management Unit (MMU) translates virtual addresses into physical
+addresses, as well as checking access permissions and giving control over 
+marking regions cacheable. This allows the programmer to identify the memory 
+regions which may be cached for faster access in the CPU core, but leaving out 
+for example hardware registers, which are mapped into memory. 
+
+There is a boundary inside the chip around the ARM-core. The ARM-core itself 
+uses virtual addresses for memory and hardware-accesses, which are translated 
+by the MMU to physical addresses when actually requesting a value in memory. 
+Every access to the memory or the "world side" has to go through the MMU. 
+
+When the ARM-core powers up the MMU is disabled, which means that every access
+will pass through unmodified, making the virtual addresses (processor side)
+equal to the physical addresses (world side). All of the examples thus far in
+this repository (e.g. blinkers) are based on physical addresses.
+
+We already know that somewhere else in the chip the used addresses are
+different. The Raspberry Pi manual is written for 0x7Exxxxxx based addresses,
+but the ARM's physical addresses for the same things are 0x20xxxxxx for the
+Raspberry Pi 1 and 0x3Fxxxxxx for the Raspberry Pi 2. For this discussion we
+only care about the ARM and the ARM MMU, not about the other mystical translation
+on the chip.
+
+### Motivation
+
+Let's say I am writing a program for, say, Linux. I would have to link
+my program to use specific addresses (or a specific address space). Let's 
+assume, that our program is loaded into 0x8000 and it can use the memory from 
+0x0000 onwards. That would be fine for one program, but let's say another 
+program wants to be loaded to 0x8000 or maybe use this space as memory for 
+variables. So how can we run several programs without the risk of them
+clobbering each other?
+
+The answer is neither is actually loaded into 0x8000 when running. The programs 
+may assume, that they can use the addresses like stated above, but in reality 
+the addresses when requesting memory cells will be translated by the MMU. So 
+one program could be placed to 0x10008000, the other one at 0x20008000. When 
+program 1 thinks it accesses 0x0000abcd it is really accessing 0x1000abcd, the 
+other one 0x2000abcd. This translation is completely transparent to the 
+programs, i.e. they will never notice, that the addresses are translated for 
+them.
+
+Theoretically you could assign every virtual address a physical address to be 
+translated to, but that does not make much sense. The ARM-core used on the 
+Raspberry Pi is a 32-bit processor, i.e. it uses 32-bit addresses. This means 
+we have 4 Giga (2^32) addresses. A table containing the physical addresses 
+alone would be 16 GB big. 
+
+The ARM has one option to translate the top 12 bits of the virtual address to 
+the top 12 bits of the physical address, leaving the lower 20 bits as they are 
+between the virtual and physical space. This means we can control 1 MB of 
+address space per definition and have 4096 entries in a table somewhere to 
+convert virtual to physical addresses. The ARM still uses all 32 Bits, 12 for 
+the top 12 address bits, the other ones as control flags. One of them indicates 
+whether a region is marked as cacheable. 
+
+### About caching
+
+A cache is a (very small but) very fast memory inside the processor. It is used 
+by the processor transparently to remember data which is loaded and/or stored 
+by your program together with its address. This behaviour saves the processor 
+from having to request the value from RAM every time it is needed, having to 
+wait for the (slower) memory on every read/write. Caching can vastly increase 
+the speed of your program. Changes to values are written through the cache.
+
+But why is it disabled with the MMU disabled? Let's assume we want to read the 
+value register of a timer. This is done by reading from a specific address. 
+What we want would be, that we get the current value every time we read the 
+register. When caching is enabled for these memory regions we would read the 
+current value one time, but after that we will only get the cached value. This 
+is no good, because you cannot control peripherals if you are unable to get the 
+current state or value of a peripheral, because the cache only gives you the 
+last (old) values. 
+
+Likewise, if you have some RAM, which is shared by more than one resource, like 
+the GPU and the ARM or several processor cores on the Raspberry Pi 2 or 3, you 
+will have a similar situation. In general you want to disable caching on every 
+region which can be modified by other means than through the cache. The MMU 
+lets you enable or disable caching on memory regions.
+
+### About access permissions
+
+Let's think back to our example with the two programs running "at the same 
+time". You don't want any of the programs to get access to the operating 
+systems data structures nor do you want any of the two modifying code or data 
+of the other program. You would not want a video game to get access to your 
+banking account open in another window, would you? 
+
+The mechanisms vary by processor family but the MMU provides the security 
+mechanisms. When a particular program is running on the processor there are 
+means to allow or restrict access to specific memory spaces. If some code
+accesses an address it does not have the permissions for, then a Data 
+Abort-Exception happens, and the processor will stop running the code of that 
+application. The Operating System will be notified (by the means of an 
+Exception Handler / Interrupt Service Routine). 
+
+This Data Abort does not have to be fatal for the application, but it could be 
+by design. Think of a virtual machine, running on the processor and when it 
+tries to access its peripherals, the real Operating System can be notified to 
+simulate the peripheral and keep the virtual machine running. 
+
+### About virtual memory
+
+What happens when you run out of memory on your computer? Let's say the RAM is
+used up completely, but an application uses for example `malloc` to request more
+memory. The operating system will then find a block of memory of another
+application and save that to disk. This space can then be used by the running
+application as memory. When the other program then tries to use the swapped out
+memory, it will trigger a Data Abort-Exception in the processor. This will
+trigger the operating system to swap that memory block back into memory (maybe 
+substituting another block of another application).
+
+The term swap comes from the idea that these blocks of ram are swapped back and 
+forth to disk, program A's ram goes to disk and is swapped with program T's, 
+then program T's is swapped with program K's and so on. This is why starting 
+right after you venture off that edge from real RAM to virtual, your computer's
+performance drops dramatically and disk activity goes way up. The more things 
+run the more memory needs to be swapped onto the much slower disk.
+
+## Wading through the documentation 
+
+I am mostly using the ARMv5 Architectural Reference Manual DDI0100I. ([ARM ARM])
+
+Unfortunately the ARM ARM does not look the same from one to the next. With the 
+Raspberry Pi 1 and Zero we are technically using an ARMv6 (architecture version 
+6), but when we go to ARM's website, there is an ARMv5, ARMv7 and ARMv8-version, 
+but no ARMv6. The ARMv5 manual is actually the original ARM ARM, where they (I 
+assume) realized, that they could not maintain all the architecture variations 
+in one document forever. So they split them per revision. With respect to the 
+MMU the ARMv5 manual covers the ARMv4, ARMv5 and ARMv6. There is a mode where
+you can have the same code and table work on all three, so you don't have to
+if-then-else your code based on whatever architecture you find. This example is
+based on this legacy mode with subpages enabled.
+
+The 1 MB sections mentioned above are called sections. The ARM MMU also has 
+blobs with a smaller size of 4096 bytes, which are called small pages. I will
+touch on those two sizes.
+
+As mentioned above the Raspberry Pi has a 32 bit address space. 1 MB sections 
+mean 20 bits unaltered (bits 19 to 0) and 12 bits translated (bits 31 to 20), meaning 4096
+1 MB sections, i.e. 4096 entries in the table. The top 12 bits of the virtual 
+address get translated to the top 12 bits of the physical address. There are no 
+additional rules on the translation, you can have for example
+- virtual = physical
+- any combination you like
+- have a bunch of virtual sections point to the same physical space.
+
+ARM uses the term Virtual Memory System Architecture or VMSA and
+they say things like VMSAv6 to talk about the ARMv6 VMSA. There
+is a section in the ARM ARM titled Virtual Memory System Architecture.
+In there we see the coprocessor registers: in the CP15 subsection, register
+2 is the translation table base register. There are three opcodes which give
+us access to three things: `TTBR0`, `TTBR1` and the control
+register.
+
+### Writing the Translation table base address
+
+First we read this comment (pg. 741, heading: Register 2: Translation table 
+base):
+> If N = 0 always use `TTBR0`. When N = 0 (the reset case), the translation
+> table base is backwards compatible with earlier versions of the
+> architecture.
+
+So we want to leave N = 0 and use `TTBR0`.
+
+The `TTBR0`-register contains the base address in the physical address space. 
+The bits 31 down to 14-n (with n=0 in our case) are used as the base address. 
+Note that the MMU cannot go through the MMU to figure out how to go through the 
+MMU. It operates exclusively in physical address space and has direct access to 
+memory. In a second we are going to see that the base address for the MMU table
+has to be aligned to 16384 bytes (2^14), i.e. the lower 14 bits of our TLB base
+address are all zeroes (TLB = Translation Look-Aside Buffer).
+
+We write that register using
+
+```c
+    mcr p15,0,r0,c2,c0,0 ;@ tlb base
+```
+
+#### The co-processor
+
+Let me explain what that mnemonic does. `mcr` is a special instruction to write 
+to registers of the co-processor. This co-processor manages loads of functions 
+of the ARM-core, like unaligned data access or the MMU. You probably already 
+came across `msr` which is an instruction to store data into the status 
+register of the ARM-core (for example to set a new privilege mode or to enable 
+interrupts). You cannot access the status register or the co-processor 
+registers with the normal `mov` instruction.
+
+The co-processor has several registers, which can be accessed by the `mcr` or 
+the `mrc` instruction. The parameters are:
+
+```c
+    MCR{cond} P15,<Opcode_1>,<Rd>,<CRn>,<CRm>,<Opcode_2>
+    MRC{cond} P15,<Opcode_1>,<Rd>,<CRn>,<CRm>,<Opcode_2>
+```
+
+[co-processor] shows the list of registers plus their assignment to the 
+parameters for the two instructions. So the above statement will access the following parameters:
+- Opcode_1: 0
+- Rd: r0
+- CRn: c2 (register number within CP15)
+- CRm: c0 (operational register)
+- Opcode_2: 0
+
+r0 serves as source register of the value to be written to the Register 
+identified by (c2,c0,0,0).
+
+#### About TLB
+
+As far as we are concerned think of the TLB as an array of 32 bit integers,
+each one being used to translate a virtual to a physical address and to
+describe permissions and caching. My example is going to have a define called
+`MMUTABLEBASE` which will be where our TLB table starts. The TLB is used as
+cache for the page tables.
+
+The MMU is completely realised in hardware, but you can configure it the way 
+you want. It will operate on the values we set into our page table with Or and 
+And-operations (i.e. bit-manipulations). It uses portions of the virtual 
+addresses to find the correct place in the page table to find the according
+physical address. From the next bits it will decide what to do next. We, as 
+programmers, need to know how the MMU calculates the place, so we can put our 
+descriptor into the correct space, so the MMU finds it.
+
+#### Translating virtual to physical addresses
+
+In the manual there is figure B4-4 (page B4-29), which shows a diagram of how 
+the addresses are translated. It uses X instead of N (which we want to be 0). 
+The modified virtual address in this diagram is, as far as we are concerned,
+simply our unmodified virtual address. The first thing
+we see, is that the lower 14 Bits of the translation table base (in my example 
+`MMUTABLEBASE`), i.e. the start address of the translation table are marked as 
+SBZ, i.e. should-be-zero. This means, that the translation table should be 
+aligned to 16 KiB (2^14 Bytes). Using 0x0000 as starting address would not make 
+much sense, as this is the place for the interrupt vector. The next good place 
+would be 0x00004000. Adding another 16 KiB to that address is (not a 
+coincidence) 0x8000, where we put our code. But any other address, which is 
+aligned to 16 KiB should work, as long as you have enough memory there and not 
+clobber anything else. 
+
+The figure B4-4 shows, that we take the top 12 bits of the virtual address, 
+multiply by 4 (or shift left by 2) and add that to the translation table base, 
+which gives the address of the first level descriptor for that virtual address. 
+A multiplication by 4 is no coincidence, but rather takes the length of each 
+descriptor into account (which is exactly 32 bit, or 4 byte). The descriptor 
+is fetched and interpreted. As long as we leave N=0 the MMU will always look 
+into the first 12 bits, which replace the first 12 bits of the virtual address. 
+The last two bits of the descriptor are flags, if they are 0b10, then it is a 
+1 MB section. If it's something different, then a second level translation 
+will be triggered, but for now let's focus on the simpler part.
+
+##### An example
+
+```
+    MMUTABLEBASE                    = 0x00004000
+    virtual address                 = 0x12345678
+    -> first 12 bit (moved to right): 0x00000123
+    -> multiplied by 4:               0x0000048c
+    descriptor for the section:       0x0000448c
+    let's assume the descriptor was   0xABC00002
+    -> physical address               0xABC45678
+```
+
+#### 16 MiB Supersections
+
+So the ARM ARM states, that you can have 16 MiB supersections. This would make 
+life easier, right? Well, no. You still have to generate 16 descriptors for 
+each of the possible 1 MiB sections, so you might as well make them 1 MiB big. 
+You can read up on the differences and try the supersections out, but I'm going
+to use 1 MiB sections for now.
+
+Maybe you figured out a bit of a problem here. Every load and store with the MMU
+enabled requires at least one MMU table lookup. The MMU memory accesses of 
+course don't have to go through the MMU, but every other store or load. This
+does have a performance hit. Therefore the MMU caches the last TLB-lookups. 
+This helps but the conversion has to be done on every requested address. 
+
+### Descriptor format
+
+I am looking at the subsection about First Level Descriptors in the ARM ARM,
+especially the Table B4-1 (pg. B4-27, First-level descriptor format (VMSAv6, 
+subpages enabled)). 
+
+This table identifies four different sets of last two bits [1:0]:
+- `0b00` - this section is unmapped. Attempting to access these addresses will generate a translation fault (Data Abort). The bits [31:2] are ignored by the hardware, although it is recommended to keep valid permissions for the descriptor there.
+- `0b01` - for coarse second level table, second level lookup required for translation; allows more fine grained sectioning of the section
+- `0b10` - sections descriptor for its associated virtual addresses, no second level lookup
+- `0b11` - reserved in VMSAv6
+
+For now let's work with the `0b10`-entries. The format of the entry is as follows:
+
+|   | Bits 31:20 | Bits 19:15 | 14,13,12 | 11,10 | 9 | 8:5 | 4 | 3 | 2 | 1,0 |
+|---|------------|------------|----------|-------|---|-----|---|---|---|-----| 
+| Section | Section base address | SBZ | TEX | AP | IMP | Domain | SBZ | C | B | `10` |
+
+The **section base address** are the 12 top bits of the physical address, which 
+substitute the top 12 bits of the virtual address. The **C** bit marks the address 
+region as cacheable. We absolutely do not want to cache peripheral regions,
+RAM-regions maybe. The C-flag simply asserts bits on the AMBA/AXI (memory) bus
+that marks the transaction as cacheable, you still need a cache setup and 
+enabled for the transaction to actually get cached. If you don't have the 
+cache for that transaction type enabled, then it just does a normal memory (or
+peripheral) operation.
+
+The **B** bit, means bufferable, as in write buffer. This enables a "cache" 
+but for writing instead of reading. When writing a value to RAM (or peripheral) 
+everything is known, the data and the address. The buffer-bit allows some logic 
+at this level to accept the value and address and continue to write the data to 
+the slower RAM or peripheral (or cache) and lets the CPU go on executing its
+program. This may give us a performance boost. When a second write appears and 
+we only have a single place for a transaction, the processor gets stalled until 
+the first one is complete and the second write-command can be saved to the 
+buffer. The advantage is that for a number of writes the processor can hand the 
+needed data to the memory controller and carry on.
+
+You need to look up the **TEX** bits yourself. I will stick to them being 0b000 
+and will not mess with any fancy features here. The combinations of TEX, C and 
+B bits make some subtle differences, look them up in Table B4-3 (CB + TEX 
+Encodings). 
+
+The **AP** bits indicate the level of access permissions (see Table B4-1 MMU 
+access permissions, pg. B4-9), for page table formats, which don't support APX, 
+value 0 is assumed. The following AP-values are therefore valid:
+- `0b00` - No access for anyone; will generate permission fault on every access
+- `0b01` - Read/Write permission for privileged mode 
+- `0b10` - Read/write for privileged mode, read for user mode (writes in user mode trigger permission faults)
+- `0b11` - Full access (R/W for everyone)
+
+The **domain** is a bit trickier to explain. There is a register right next to the translation table base address register which contains 16 different domain specifications. These definitions are 2 bit long each:
+- `0b00` - **no access**, any access generates a domain fault
+- `0b01` - **client**, accesses are checked against the access permission bits in the TLB entry
+- `0b10` - **UNPREDICTABLE** behaviour
+- `0b11` - **manager**, accesses are not checked and cannot generate a permission fault
+
+The domains basically are 16 different definitions which control the behaviour 
+on access. We can define for example 16 types of applications and assign them 
+sections. We assign sections to domains by setting the four **domain** bits of 
+the translation table entry to the number of the definition in the register. 
+By changing two bits in this register we can then put sections of a domain
+into another permission mode, which is quite useful, because we don't need to 
+change the MMU table.
+
+For starters we are going to set all of the domains to `0b11` don't check and 
+all of our sections can have the domain number 0. 
+
+## A simple implementation
+
+```c
+    /** \brief creates a translation table entry (for sections of size 1 MiB)
+     * \param[in] virtual the virtual address (only top 12 bits used)
+     * \param[in] physical the physical address (only top 12 bits used)
+     * \param[in] flags the flags for the section (only bits 14:2 are used)
+     * \return always 0
+     **/
+    uint32_t mmu_section ( uint32_t virtual, uint32_t physical, uint32_t flags )
+    {
+        uint32_t offset = virtual >> 20;
+        // plus and or are the same thing here, as MMUTABLEBASE is 16 KiB (14 bit) aligned
+        uint32_t* entry = (uint32_t*) (MMUTABLEBASE | (offset<<2));
+        
+        // keep top 12 bits of physical address then ORR flags and 0x02 for 1 MiB
+        uint32_t physval = (physical & 0xfff00000) | (flags & 0x7ffc) | 0x02; 
+
+        *entry = physval;
+        return(0);
+    }
+
+    #define CACHEABLE 0x08
+    #define BUFFERABLE 0x04
+```
+
+### Filling the table with sections
+
+Before enabling the MMU itself we need to make sure that every section of 
+memory we want to use is defined with a valid entry in the table. If not, an 
+access to that region will raise a fault and run your fault handler - if you 
+wrote one - which in turn may access non-mapped memory, which is not good. 
+
+The smallest amount of RAM on a Raspberry Pi is 256 MiB or 0x10000000 bytes. 
+Our program enters at address 0x8000, so that is within the first
+section 0x000xxxxx so we should make that section cacheable and
+bufferable.
+
+```c
+    mmu_section( 0x00000000,0x00000000, CACHEABLE | BUFFERABLE );
+```
+
+This statement will create an entry mapping the virtual address space 0x000xxxxx 
+to the physical addresses 0x000xxxxx and enable the cache and write buffer. If we 
+want to use all 256 MiB we would need to do this for all the sections from 
+0x000xxxxx to 0x0FFxxxxx.
+
+
+
+We know that for the Raspberry Pi 1 the peripherals, like AUX / UART and such 
+are in ARM physical space at 0x20xxxxxx. To allow for more RAM on the Raspberry 
+Pi 2 they needed to change that peripheral base address and moved it to 0x3Fxxxxxx. 
+We can either create 16 1 MiB section entries to cover the whole range of 
+peripherals or we only define the sections we care to talk to. The UART and the 
+GPIO are associated with the 0x202xxxxx space. There are a couple of timers 
+in the 0x200xxxxx space so one entry can cover those.
+
+``` c
+    mmu_section(0x20000000,0x20000000,0x0000); //NOT CACHED!
+    mmu_section(0x20200000,0x20200000,0x0000); //NOT CACHED!
+    mmu_section(0x3F000000,0x3F000000,0x0000); //NOT CACHED!
+    mmu_section(0x3F200000,0x3F200000,0x0000); //NOT CACHED!
+```
+
+These sections are not cached and not buffered, but we may play with that 
+to demonstrate what caching a peripheral can do to you, why we need to turn on 
+the MMU if for no other reason than to get some bare metal performance by 
+using the cache.
+
+Now you have to think on a system level here, there are a number
+of things in play. We need to plan our memory space, where are we
+putting the MMU table, where are our peripherals, where is our program.
+
+If the only reason for using the MMU is to allow the use of the cache
+then just map the whole world 1:1; if you want with the peripherals not cached 
+and the rest cached.
+
+### Cache invalidation
+
+So once our tables are set up we need to actually turn the MMU on. When 
+initialising a cache you want to clean out all the entries in a safe way. You 
+want to invalidate everything, i.e. mark every cache line as empty / available. 
+Likewise you want the TLB (the caching the MMU does) to be invalidated, so the 
+MMU does not start up with stale TLB entries that don't match our table 
+entries. Also we want the CPU to do a Data Synchronization Barrier (DSB), so 
+every explicit memory transaction is finished before the next instruction begins. 
+
+All of the above can be done using the C15 [co-processor]. So, to summarise:
+1. Invalidate all caches (Instruction and data, write 0 to `0, c7, c7, 0`)
+2. Invalidate the TLB entries (write 0 to `0, c8, c7, 0`)
+3. Data synchronisation barrier (write 0 to `0, c7, c10, 4`)
+4. Set the domain access controls (write 0xffffffff to `0, c3, c0, 0`, 0b11 for every domain)
+5. Set the base address for the translation table (`0,c2,c0,0`)
+6. Enable level 1 caches and the MMU in the control register (`0,c1,c0,0`) and some other useful things:
+  - bit 0 (M) enables MMU
+  - bit 2 (C) enables level 1 data cache
+  - bit 11 (Z) enables branch prediction
+  - bit 12 (I) enables instruction cache
+  - bit 22 (U) enables non-aligned data access as well as mixed big-/little-endian data access
+
+Which of these bits you want to set is up to you. I would recommend M, C and I, 
+but I am going ahead and setting them all. So simple example code implementing 
+the MMU-enabling process might be this: 
+
+```c
+    .global mmu_init
+    mmu_init:
+        mov r1,#0
+        // invalidate caches
+        mcr p15,0,r1,c7,c7,0 
+        // invalidate TLB entries
+        mcr p15,0,r1,c8,c7,0 
+        // data synchronisation barrier
+        mcr p15,0,r1,c7,c10,4 
+        
+        // set all domains to 0b11
+        ldr r1, =0xffffffff
+        mcr p15,0,r1,c3,c0,0
+        
+        // set the translation table base address (remember to align 16 KiB!)
+        mcr p15,0,r0,c2,c0,0
+        
+        // set the bits mentioned above
+        ldr r1, =0x00401805
+        mrc p15,0,r2,c1,c0,0
+        orr r2,r2,r1
+        mcr p15,0,r2,c1,c0,0
+        
+        mov pc, lr
+```
+
+For messing with the translation tables after the MMU is started, you will need
+to invalidate the TLB cache again, so let's put this part into its own function.
+We don't need to care about the L1 cache this time. Also ARMv6 introduces a 
+feature to help with invalidating the TLB, but I'm going with this solution:
+
+```c
+    .globl invalidate_tlb
+    invalidate_tlb:
+        mov r2,#0
+        // invalidate TLB entries (the value written must be the zero held in r2)
+        mcr p15,0,r2,c8,c7,0 
+        // data synchronisation barrier
+        mcr p15,0,r2,c7,c10,4 
+        mov pc,lr
+```
+
+#### (JTAG) Debugging and caching
+
+Something to note here. Debugging using the JTAG based on-chip-debugger
+makes life easier. No SD-card swapping and no more EEPROM-flashing. BUT,
+it is not completely without issue. The basic problem is that caches often 
+separate instruction fetches from data reads and writes. Let's say you execute 
+an instruction at 0xD0000 (which is then cached) and an instruction at 0xC000. 
+Then you transfer your new program, set the start address and run again. 
+
+For each instruction in the program the prior instruction at that address might 
+still be in the instruction cache and the new one in main RAM (or data cache). 
+So, when running the new program you might still be running the old 
+instructions, which are fetched from the instruction cache, not the RAM 
+(unless an invalidate or flush happens).
+
+There is also the question: are the instruction and data caches shared? 
+That can be specific to the core and your setup. Is your JTAG-debugger able to 
+disable the caches, has it done that for you, or can you do it manually?
+
+Any time you are using the instruction or data caches you need to be careful 
+using a JTAG-debugger or even a bootloader type approach depending on its
+design as you might end up doing data writes of instructions and going
+around the instruction cache or worse. This may be done by your JTAG debugger,
+but keep in mind to change back to / from Raspberry Pi 2 when switching between 
+the Pis. Otherwise this might drive you mad, when you keep downloading new 
+code but the Pi crashes or behaves unexpectedly.
+
+## Having fun with address translation
+
+So the example is going to start with the MMU off and write to
+addresses in four different 1MB address spaces, so we can play with the section 
+descriptors and demonstrate virtual to physical address conversion later.
+
+```c
+    // write data to four different 1 MiB sections
+    PUT32(0x00045678,0x00045678);
+    PUT32(0x00145678,0x00145678);
+    PUT32(0x00245678,0x00245678);
+    PUT32(0x00345678,0x00345678);
+
+    // write the data back to UART
+    hexstring(GET32(0x00045678));
+    hexstring(GET32(0x00145678));
+    hexstring(GET32(0x00245678));
+    hexstring(GET32(0x00345678));
+    // 0D    CR  '\r' (carriage ret)
+    uart_send(0x0D); 
+    // 0A    LF  '\n' (new line)
+    uart_send(0x0A);
+
+    // Then setup the MMU with at least those four sections
+    mmu_section(0x00000000,0x00000000,CACHEABLE | BUFFERABLE);
+    mmu_section(0x00100000,0x00100000,0x0000);
+    mmu_section(0x00200000,0x00200000,0x0000);
+    mmu_section(0x00300000,0x00300000,0x0000);
+    
+    //  and the peripherals:
+    mmu_section(0x20000000,0x20000000,0x0000); //NOT CACHED!
+    mmu_section(0x20200000,0x20200000,0x0000); //NOT CACHED!
+
+    // Start the MMU with the instruction and data caches enabled:
+    mmu_init ( MMUTABLEBASE );
+
+    // when we read those four addresses back we get the same output
+    // as we wrote before because we mapped virtual = physical
+    hexstring(GET32(0x00045678));
+    hexstring(GET32(0x00145678));
+    hexstring(GET32(0x00245678));
+    hexstring(GET32(0x00345678));
+    uart_send(0x0D); uart_send(0x0A);
+```
+
+But what if we swizzle things around? Don't mess with the 0x00000000-section, 
+because that is where our code is.
+
+```c
+    // change the table entries
+    mmu_section(0x00100000,0x00300000,0x0000);
+    mmu_section(0x00200000,0x00000000,0x0000);
+    mmu_section(0x00300000,0x00100000,0x0000);
+
+    // invalidate the TLB
+    invalidate_tlb();
+
+    // and read the addresses again, which we wrote to above
+    hexstring(GET32(0x00045678));
+    hexstring(GET32(0x00145678));
+    hexstring(GET32(0x00245678));
+    hexstring(GET32(0x00345678));
+    uart_send(0x0D); uart_send(0x0A);
+```
+
+The 0x000xxxxx entry was not modified, so we get 0x00045678 as the output. The
+section 0x001xxxxx will read from physical addresses 0x003xxxxx so we get the 
+0x00345678 output, 0x002xxxxx will translate to the 0x000xxxxx space
+so that read gives 0x00045678 and the 0x003xxxxx is mapped to physical 
+0x001xxxxx giving 0x00145678 as the output.
+
+So up to this point the output looks like this:
+
+```
+    00045678
+    00145678
+    00245678
+    00345678
+
+    00045678
+    00145678
+    00245678
+    00345678
+
+    00045678
+    00345678
+    00045678
+    00145678
+```
+
+The first block is with the MMU disabled, the second one with MMU enabled but 
+1:1 virtual to physical translation, the third one with the non 1:1 translation.
+
+## Coarse paging
+
+With coarse paging the logic does not know what kind of translation will be done
+until the first level read, so the first step is identical to the translation
+above. If it sees a `0b01` as the lower 2 bits of the first level descriptor, 
+then it knows it's a coarse page entry and it needs a second level fetch. 
+Table B4-5 Accessing coarse page table second-level descriptors (pg. B4-30) 
+shows the logic fetching the second level descriptor. 
+
+### Second level descriptor format
+
+There are two things to the two level translation. At first we need to set the 
+first level descriptors accordingly. The format is as follows (see Table B4-1, 
+pg. B4-27, First-level descriptor format (VMSAv6, subpages enabled)):
+
+|   | Bits 31:10 | 9 | 8:5 | 4,3,2 | 1,0 |
+|---|------------|---|-----|-------|-----|
+| Coarse Page Table | Coarse Page Table base Address | IMP | Domain | IMP | `01` |
+
+The bits 4:2 are implementation defined and should be zero (SBZ) for VMSAv6. 
+The domain-bits are used as above. The bits 31:10 are used as the base address 
+of the second level page table. This second level page table needs to be 
+aligned to 1 KiB in memory. 
+
+So after fetching the first level descriptor the bits 31:10 of the entry will 
+be used as bits 31:10 of the second level descriptor's address, i.e. as the 
+base address of the second level table. The virtual address bits 19:12 will be
+used to navigate inside that second level table, i.e. they are shifted completely 
+to the right, multiplied by 4 and then added onto the base address. The lowest 2 
+bits of the address of the second level entry are always zero. Note that there are 
+256 possibilities to fill the bits 19:12 of the virtual address, i.e. the 
+section is divided into 256 parts, i.e. 4 KiB pages.
+
+The second level descriptor looks like this (for small pages, for more see 
+Table 4-3 Second level descriptor format (subpages enabled), pg. 4-31):
+
+|   | Bits 31:12 | 11,10 | 9,8 | 7,6 | 5,4 | 3 | 2 | 1,0 |
+|---|------------|-------|-----|-----|-----|---|---|-----|
+| Small page | Small page base address | AP3 | AP2 | AP1 | AP0 | C | B | `10` |
+
+Note, that there are four **AP**-fields here. The small page is divided further 
+into four blocks of the same size (i.e. in the case of small pages 1 KiB), which
+have their own AP-Access control. AP0 applies to the block with the lowest base 
+address. You can set them all to `0b11` - full access and not care, or have 
+fine grained access control over the blocks of that page.
+
+### An example
+
+This example is a bit crazy, as the address of the second level descriptor is
+an address, where we don't even have RAM anymore on the Raspberry Pi. Normally
+you want to keep your second level tables somewhere near the first level table, 
+so you have the memory management information in a confined space. 
+
+```
+    MMUTABLEBASE                          = 0x00004000
+    virtual address                       = 0x12345678
+    -> bits 19:12 of the virtual address:   0x00000045
+    address of the first level descriptor:  0x0000448c
+    let's assume a descriptor value of:     0xABCDE001
+    base address of second level table:     0xABCDE000
+    offset to the second level table:       0x00000114
+    address of the second level descriptor: 0xABCDE114
+```
+
+### A simple implementation
+
+```c
+    /** \brief creates a coarse first level entry plus one small page (4 KiB) second level entry
+     * \param[in] virtual the virtual address (bits 31:20 pick the first level entry, bits 19:12 the second level entry)
+     * \param[in] physical the physical address (only top 20 bits used)
+     * \param[in] flags domain in bits 8:5 (first level), C and B in bits 3:2 (second level)
+     * \param[in] secondbase address of the second level table (only bits 31:10 used, i.e. 1 KiB aligned)
+     * \return always 0 */
+    uint32_t mmu_page ( uint32_t virtual, uint32_t physical, uint32_t flags, uint32_t secondbase )
+    {
+        uint32_t offset = virtual >> 20;
+        // plus and or are the same thing here, as MMUTABLEBASE is 16 KiB (14 bit) aligned
+        uint32_t* entry = (uint32_t*) (MMUTABLEBASE | (offset<<2));
+        // second level table base in bits 31:10, domain in bits 8:5 and 0x01 for coarse translation
+        uint32_t entryval = (secondbase & 0xfffffc00) | (flags & 0x1e0) | 0x01; 
+
+        // set first level descriptor
+        *entry = entryval;
+        
+        // mask everything except bits 19:12
+        offset = (virtual >> 12) & 0xff;
+        // form the address of the second level descriptor
+        uint32_t* secondLevelEntry = (uint32_t*) ((secondbase & 0xfffffc00) | (offset << 2));
+        
+        // form the value of the second level descriptor
+        // bits 31:12 are the page base address, flags contain B,C, AP_x = 0b11 
+        // for all and the 0x02 at the end to identify the entry as small page
+        uint32_t physval = (physical & 0xfffff000) | 0xff0 | (flags & 0xc) | 0x02;
+        
+        // set the second level descriptor
+        *secondLevelEntry = physval;
+        return(0);
+    }
+```
+
+So let's assign some sections to coarse translation:
+
+```c
+    mmu_page(0x0AA45000,0x00145000,0,0x00000400);
+    mmu_page(0x0BB45000,0x00245000,0,0x00000800);
+    mmu_page(0x0CC45000,0x00345000,0,0x00000C00);
+    mmu_page(0x0DD45000,0x00345000,0,0x00001000);
+    mmu_page(0x0DD46000,0x00146000,0,0x00001000);
+    
+    invalidate_tlb();
+```
+
+Let's look at the last two `mmu_page`-statements here. You will notice that
+the `secondbase`-parameter is the same here. This is in fact wanted, as I want 
+to add an entry into the secondary table I assigned before, not set a new one, 
+i.e. orphaning the old one. So let's assume I would set a new secondary table 
+base address like this:
+
+```c
+    mmu_page(0x0DD45000,0x00345000,0,0x00001000);
+    mmu_page(0x0DD46000,0x00146000,0,0x00001400);
+```
+
+When I try to access an address in the page 0x0DD45xxx, then the MMU would look
+inside a secondary table located at 0x00001400, which of course does not contain 
+our previously set entry for the small page. But it will definitely find 
+something there, and probably behave unexpectedly, if we are not aware of our 
+mistake here. So always make sure the secondary table base addresses of the 
+pages in the same section are the same. 
+
+## Access violation
+
+First we want to set a domain to `0b00` (no access), so accessing a section 
+with that domain will definitely trigger an access violation. I will assume we 
+wrote that to domain number 1. 
+
+```c
+    // set the domain of a section to 0x01
+    mmu_section(0x00100000,0x00100000,0x0020);
+    invalidate_tlb();
+
+    // then read the data from the sections
+    hexstring(GET32(0x00045678));
+    hexstring(GET32(0x00145678));
+    hexstring(GET32(0x00245678));
+    hexstring(GET32(0x00345678));
+    uart_send(0x0D); uart_send(0x0A);
+```
+
+I expect that second read-statement to trigger a Data Abort-Exception, so I 
+want to write an exception handler, to read the status information of that 
+exception. We need the following registers of the C15 co-processor:
+- `0,c5,c0,0` - Data Fault Status Register
+- `0,c5,c0,1` - Instruction Fault Status Register
+
+### Data Fault Status Register
+
+This register holds the source of the last data fault. The bits have the 
+following functions:
+
+| Bit 31:13 | 12 | 11 | 10 | 9 | 8 | 7:4 | 3:0 |
+|-----|----------|----|----|---|---|-----|-----|
+| SBZ | SD | RW | S | 0 | 0 | Domain | Status |
+
+**SD** indicates an AXI Decode or Slave error caused the abort (only valid for 
+external aborts, for all other should be zero). **RW** indicates whether a read 
+(0) or a write (1) access caused the abort. The **S**-flag is part of the status 
+field. The **Domain** bits indicate the domain which was accessed when the abort
+occurred. The **Status** bits show the type of fault generated. See the 
+[Data Fault Status Register]-manual for a list.
+
+### Instruction Fault Status Register
+
+This register holds the source of the last instruction fault. The bits have the following functions:
+
+| Bit 31:13 | 12 | 11 | 10 | 9:4 | 3:0 |
+|-----|----------|----|----|------|-----|
+| SBZ | SD | SBZ | 0  | SBZ | Status |
+
+See the [Instruction Fault Status Register]-manual for the list of status combinations.
+
+### Reading the status registers
+
+```c
+    data_abort:
+        // save the link-register
+        mov r6,lr
+        // get the last executed instruction
+        ldr r8,[r6,#-8]
+        
+        // reading the status register
+        mrc p15,0,r4,c5,c0,0 ;@ data/combined
+        mrc p15,0,r5,c5,c0,1 ;@ instruction
+        mov sp,#0x00004000
+        
+        // print data fault status register
+        mov r0,r4
+        bl hexstring
+        
+        // print instruction fault status register
+        mov r0,r5
+        bl hexstring
+        
+        // print the link register
+        mov r0,r6
+        bl hexstring
+        
+        // print the bit-representation of the last executed instruction
+        mov r0,r8
+        bl hexstring
+        
+        b hang
+```
+
+Running the code results in:
+
+```
+    00045678 
+    00000019 
+    00000000 
+    00008104 
+    E5900000
+```
+
+The first line is the one correct data-read we do in our code above. The 
+next value is the data fault status register, which indicates that the domain
+0x01 was accessed and aborted with an 0x09 fault, i.e. a Domain Section fault. 
+The third line is the value of the instruction fault status register, which
+indicates a status of 0x0, i.e. "no function, reset value". That probably means 
+no fault happened.
+
+The fourth line is the link register, i.e. the address to the instruction which 
+would have been executed next, and the last line is the binary representation 
+of the instruction which caused the fault. Use a disassembler to view the 
+instruction in all its mnemonic glory:
+
+```
+    80fc:	e5900000 	ldr	r0, [r0]
+```
+
+# Conclusion
+
+This is just a simple intro to MMUs, just enough to be dangerous. The MMU is one
+of the simplest peripherals to program so long as bit manipulations are not 
+something that causes you to lose sleep. But if you mess it up even a bit, or 
+forget something, you can crash in spectacular ways (often silently without any 
+way of knowing what really happened). Debugging can be hard at best.
+
+The ARM ARM indicates that ARMv6 adds a feature of separating the data from the
+instructions from the MMU's perspective, which is an interesting thought (see the 
+JTAG-debugging comments). 
+
+[ARM ARM]: https://www.scss.tcd.ie/~waldroj/3d1/arm_arm.pdf
+[co-processor]: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0301h/ch03s02s01.html
+[Data Fault Status Register]: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0301h/Bgbiaghh.html
+[Instruction Fault Status Register]: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0301h/Bgbccfgi.html
diff --git a/mmu/coarse_translation.ps b/mmu/coarse_translation.ps
deleted file mode 100644
index 234265d..0000000
Binary files a/mmu/coarse_translation.ps and /dev/null differ
diff --git a/mmu/notmain.c b/mmu/notmain.c
index a33719a..46fd6c2 100644
--- a/mmu/notmain.c
+++ b/mmu/notmain.c
@@ -1,15 +1,13 @@
-
-//-------------------------------------------------------------------------
-//-------------------------------------------------------------------------
+#include <stdint.h>
 
 extern void PUT32 ( unsigned int, unsigned int );
 extern void PUT16 ( unsigned int, unsigned int );
 extern unsigned int GET32 ( unsigned int );
 
-extern void start_mmu ( unsigned int, unsigned int );
+extern void mmu_init ( uint32_t );
+extern void mmu_domain ( uint32_t );
 extern void stop_mmu ( void );
-extern void invalidate_tlbs ( void );
-extern void invalidate_caches ( void );
+extern void invalidate_tlb ( void );
 
 extern void uart_init ( void );
 extern void uart_send ( unsigned int );
@@ -17,43 +15,64 @@ extern void uart_send ( unsigned int );
 extern void hexstrings ( unsigned int );
 extern void hexstring ( unsigned int );
 
-unsigned int system_timer_low ( void );
-
 #define MMUTABLEBASE 0x00004000
 
-//-------------------------------------------------------------------
-unsigned int mmu_section ( unsigned int vadd, unsigned int padd, unsigned int flags )
+#define CACHEABLE 0x08
+#define BUFFERABLE 0x04
+
+/** \brief creates a translation table entry (for sections of size 1 MiB)
+ * \param[in] virtual the virtual address (only top 12 bits used)
+ * \param[in] physical the physical address (only top 12 bits used)
+ * \param[in] flags the flags for the section (only bits 14:2 are used)
+ * \return always 0
+ **/
+uint32_t mmu_section ( uint32_t virtual, uint32_t physical, uint32_t flags )
 {
-    unsigned int ra;
-    unsigned int rb;
-    unsigned int rc;
-
-    ra=vadd>>20;
-    rb=MMUTABLEBASE|(ra<<2);
-    rc=(padd&0xFFF00000)|0xC00|flags|2;
-    //hexstrings(rb); hexstring(rc);
-    PUT32(rb,rc);
+    uint32_t offset = virtual >> 20;
+    // plus and or are the same thing here, as MMUTABLEBASE is 16 KiB (14 bit) aligned
+    uint32_t* entry = (uint32_t*) (MMUTABLEBASE | (offset<<2));
+    
+    // keep top 12 bits of physical address then ORR flags (0x7ffc keeps B = 0x04) and 0x02 for 1 MiB
+    uint32_t physval = (physical & 0xfff00000) | (flags & 0x7ffc) | 0x02; 
+
+    *entry = physval;
     return(0);
 }
-//-------------------------------------------------------------------
-unsigned int mmu_small ( unsigned int vadd, unsigned int padd, unsigned int flags, unsigned int mmubase )
+
+
+/** \brief creates a coarse first level entry plus one small page (4 KiB) second level entry
+ * \param[in] virtual the virtual address (bits 31:20 pick the first level entry, bits 19:12 the second level entry)
+ * \param[in] physical the physical address (only top 20 bits used)
+ * \param[in] flags domain in bits 8:5 (first level), C and B in bits 3:2 (second level)
+ * \param[in] secondbase address of the second level table (only bits 31:10 used, i.e. 1 KiB aligned)
+ * \return always 0 */
+uint32_t mmu_page ( uint32_t virtual, uint32_t physical, uint32_t flags, uint32_t secondbase )
 {
-    unsigned int ra;
-    unsigned int rb;
-    unsigned int rc;
-
-    ra=vadd>>20;
-    rb=MMUTABLEBASE|(ra<<2);
-    rc=(mmubase&0xFFFFFC00)/*|(domain<<5)*/|1;
-    //hexstrings(rb); hexstring(rc);
-    PUT32(rb,rc); //first level descriptor
-    ra=(vadd>>12)&0xFF;
-    rb=(mmubase&0xFFFFFC00)|(ra<<2);
-    rc=(padd&0xFFFFF000)|(0xFF0)|flags|2;
-    //hexstrings(rb); hexstring(rc);
-    PUT32(rb,rc); //second level descriptor
+    uint32_t offset = virtual >> 20;
+    // plus and or are the same thing here, as MMUTABLEBASE is 16 KiB (14 bit) aligned
+    uint32_t* entry = (uint32_t*) (MMUTABLEBASE | (offset<<2));
+    // second level table base in bits 31:10, domain in bits 8:5 and 0x01 for coarse translation
+    uint32_t entryval = (secondbase & 0xfffffc00) | (flags & 0x1e0) | 0x01; 
+
+    // set first level descriptor
+    *entry = entryval;
+    
+    // mask everything except bits 19:12
+    offset = (virtual >> 12) & 0xff;
+    // form the address of the second level descriptor
+    uint32_t* secondLevelEntry = (uint32_t*) ((secondbase & 0xfffffc00) | (offset << 2));
+    
+    // form the value of the second level descriptor
+    // bits 31:12 are the page base address, flags contain B,C, AP_x = 0b11 
+    // for all and the 0x02 at the end to identify the entry as small page
+    uint32_t physval = (physical & 0xfffff000) | 0xff0 | (flags & 0xc) | 0x02;
+    
+    // set the second level descriptor
+    *secondLevelEntry = physval;
     return(0);
 }
+
+    
 //------------------------------------------------------------------------
 int notmain ( void )
 {
@@ -95,7 +114,7 @@ int notmain ( void )
     mmu_section(0x20000000,0x20000000,0x0000); //NOT CACHED!
     mmu_section(0x20200000,0x20200000,0x0000); //NOT CACHED!
 
-    start_mmu(MMUTABLEBASE,0x00000001|0x1000|0x0004); //[23]=0 subpages enabled = legacy ARMv4,v5 and v6
+    mmu_init( MMUTABLEBASE );
 
     hexstring(GET32(0x00045678));
     hexstring(GET32(0x00145678));
@@ -106,7 +125,7 @@ int notmain ( void )
     mmu_section(0x00100000,0x00300000,0x0000);
     mmu_section(0x00200000,0x00000000,0x0000);
     mmu_section(0x00300000,0x00100000,0x0000);
-    invalidate_tlbs();
+    invalidate_tlb();
 
     hexstring(GET32(0x00045678));
     hexstring(GET32(0x00145678));
@@ -114,16 +133,16 @@ int notmain ( void )
     hexstring(GET32(0x00345678));
     uart_send(0x0D); uart_send(0x0A);
 
-    mmu_small(0x0AA45000,0x00145000,0,0x00000400);
-    mmu_small(0x0BB45000,0x00245000,0,0x00000800);
-    mmu_small(0x0CC45000,0x00345000,0,0x00000C00);
-    mmu_small(0x0DD45000,0x00345000,0,0x00001000);
-    mmu_small(0x0DD46000,0x00146000,0,0x00001000);
+    mmu_page(0x0AA45000,0x00145000,0,0x00000400);
+    mmu_page(0x0BB45000,0x00245000,0,0x00000800);
+    mmu_page(0x0CC45000,0x00345000,0,0x00000C00);
+    mmu_page(0x0DD45000,0x00345000,0,0x00001000);
+    mmu_page(0x0DD46000,0x00146000,0,0x00001000);
     //put these back
     mmu_section(0x00100000,0x00100000,0x0000);
     mmu_section(0x00200000,0x00200000,0x0000);
     mmu_section(0x00300000,0x00300000,0x0000);
-    invalidate_tlbs();
+    invalidate_tlb();
 
     hexstring(GET32(0x0AA45678));
     hexstring(GET32(0x0BB45678));
@@ -137,9 +156,9 @@ int notmain ( void )
     uart_send(0x0D); uart_send(0x0A);
 
     //access violation.
-
+    mmu_domain ( 0xffffff03 );
     mmu_section(0x00100000,0x00100000,0x0020);
-    invalidate_tlbs();
+    invalidate_tlb();
 
     hexstring(GET32(0x00045678));
     hexstring(GET32(0x00145678));
diff --git a/mmu/novectors.s b/mmu/novectors.s
index deaf533..a1e36fd 100644
--- a/mmu/novectors.s
+++ b/mmu/novectors.s
@@ -74,22 +74,32 @@ handler:
     b hang
 
 data_abort:
+    // save the link-register so it can be printed below
     mov r6,lr
+    // load the instruction word at lr-8, i.e. the instruction that faulted
     ldr r8,[r6,#-8]
+    
+    // read the data (r4) and instruction (r5) fault status registers
     mrc p15,0,r4,c5,c0,0 ;@ data/combined
     mrc p15,0,r5,c5,c0,1 ;@ instruction
     mov sp,#0x00004000
-    bl hexstring
+    
+    // print data fault status register
     mov r0,r4
     bl hexstring
+    
+    // print instruction fault status register
     mov r0,r5
     bl hexstring
+    
+    // print the saved link register
     mov r0,r6
     bl hexstring
+    
+    // print the bit-representation of the faulting instruction
     mov r0,r8
     bl hexstring
-    mov r0,r7
-    bl hexstring
+    
     b hang
 
 .globl PUT32
@@ -106,25 +116,35 @@ GET32:
 dummy:
     bx lr
 
-.globl start_mmu
-start_mmu:
-    mov r2,#0
-    mcr p15,0,r2,c7,c7,0 ;@ invalidate caches
-    mcr p15,0,r2,c8,c7,0 ;@ invalidate tlb
-    mcr p15,0,r2,c7,c10,4 ;@ DSB ??
-
-    mvn r2,#0
-    bic r2,#0xC
-    mcr p15,0,r2,c3,c0,0 ;@ domain
-
+.global mmu_domain
+mmu_domain:
+    mcr p15, 0, r0, c3, c0, 0 ;@ write r0 (the uint32_t argument) to the domain access control register
+    mov pc, lr
+
+.global mmu_init
+mmu_init:
+    mov r1,#0
+    // invalidate instruction and data caches
+    mcr p15,0,r1,c7,c7,0 
+    // invalidate TLB entries
+    mcr p15,0,r1,c8,c7,0 
+    // data synchronisation barrier
+    mcr p15,0,r1,c7,c10,4 
+    
+    // set all 16 domains to 0b11 (manager, accesses are not checked)
+    ldr r1, =0xffffffff
+    mcr p15,0,r1,c3,c0,0
+    
+    // set the translation table base address from r0 (remember to align 16 KiB!)
     mcr p15,0,r0,c2,c0,0 ;@ tlb base
-    mcr p15,0,r0,c2,c0,1 ;@ tlb base
-
+    
+    // OR the enable bits into the control register: M (0), C (2), Z (11), I (12), U (22)
+    ldr r1, =0x00401805
     mrc p15,0,r2,c1,c0,0
     orr r2,r2,r1
     mcr p15,0,r2,c1,c0,0
-
-    bx lr
+    
+    mov pc, lr
 
 .globl stop_mmu
 stop_mmu:
@@ -135,8 +155,8 @@ stop_mmu:
     mcr p15,0,r2,c1,c0,0
     bx lr
 
-.globl invalidate_tlbs
-invalidate_tlbs:
+.globl invalidate_tlb
+invalidate_tlb:
     mov r2,#0
     mcr p15,0,r2,c8,c7,0  ;@ invalidate tlb
     mcr p15,0,r2,c7,c10,4 ;@ DSB ??
diff --git a/mmu/periph.c b/mmu/periph.c
index 2e0d49b..a2489a4 100644
--- a/mmu/periph.c
+++ b/mmu/periph.c
@@ -6,7 +6,6 @@ extern void PUT32 ( unsigned int, unsigned int );
 extern void PUT16 ( unsigned int, unsigned int );
 extern void PUT8 ( unsigned int, unsigned int );
 extern unsigned int GET32 ( unsigned int );
-extern void BRANCHTO ( unsigned int );
 extern void dummy ( unsigned int );
 
 #define SYSTIMERCLO     (0x20003004)
@@ -120,13 +119,7 @@ void uart_init ( void )
     PUT32(GPPUDCLK0,0);
     PUT32(AUX_MU_CNTL_REG,3);
 }
-//-------------------------------------------------------------------------
-unsigned int system_timer_low ( void )
-{
-    return(GET32(SYSTIMERCLO));
-}
-//-------------------------------------------------------------------------
-//-------------------------------------------------------------------------
+
 
 
 //-------------------------------------------------------------------------
diff --git a/mmu/section_translation.ps b/mmu/section_translation.ps
deleted file mode 100644
index d81ebdc..0000000
Binary files a/mmu/section_translation.ps and /dev/null differ