Monday 30 November 2015

Minimal GCC setup for K64F

Introduction

Usually, the vendor of a hardware development kit (for small embedded applications) delivers a ready-to-use SDK. See for instance ST, Nordic or Freedom. Such an SDK contains a number of modules: startup code, drivers, libraries, toolchains, and sometimes an already ported OS and/or network stack. Usually the same SDK is shared between a variety of products of the same vendor (for instance different boards based on the same SoC). However, vendors can favor a toolchain that you're not using at all. SDKs are indispensable in general, but what if you just want to light an LED on your board and don't want to dig into the details of a complex SDK? What if the IDE for which the projects are configured by default is not your favorite? Let's see what the minimal possible GCC setup for running the K64F (Cortex-M4) actually looks like, without all the legacy startup code like this (example from the Kinetis SDK):

116 void SystemInit (void) {
117 #if ((__FPU_PRESENT == 1) && (__FPU_USED == 1))
118   SCB->CPACR |= ((3UL << 10*2) | (3UL << 11*2));    /* set CP10, CP11 Full Access */
119 #endif /* ((__FPU_PRESENT == 1) && (__FPU_USED == 1)) */
120 #if (DISABLE_WDOG)
121   /* WDOG->UNLOCK: WDOGUNLOCK=0xC520 */
122   WDOG->UNLOCK = WDOG_UNLOCK_WDOGUNLOCK(0xC520); /* Key 1 */
123   /* WDOG->UNLOCK: WDOGUNLOCK=0xD928 */
124   WDOG->UNLOCK = WDOG_UNLOCK_WDOGUNLOCK(0xD928); /* Key 2 */
125   /* WDOG->STCTRLH: ?=0,DISTESTWDOG=0,BYTESEL=0,TESTSEL=0,TESTWDOG=0,?=0,?=1,WAITEN=1,STOPEN=1,DBGEN=0,ALLOWUPDATE=1,WINEN=0,IRQRSTEN=0,CLKSRC=1,WDOGEN=0     */
126   WDOG->STCTRLH = WDOG_STCTRLH_BYTESEL(0x00) |
127                  WDOG_STCTRLH_WAITEN_MASK |
128                  WDOG_STCTRLH_STOPEN_MASK |
129                  WDOG_STCTRLH_ALLOWUPDATE_MASK |
130                  WDOG_STCTRLH_CLKSRC_MASK |
131                  0x0100U;
132 #endif /* (DISABLE_WDOG) */
133 #ifdef CLOCK_SETUP
134   if((RCM->SRS0 & RCM_SRS0_WAKEUP_MASK) != 0x00U)
135   {
136     if((PMC->REGSC & PMC_REGSC_ACKISO_MASK) != 0x00U)
137     {
138        PMC->REGSC |= PMC_REGSC_ACKISO_MASK; /* Release hold with ACKISO:  Only has an effect if recovering from VLLSx.*/
139     }
140   } else {
141 #ifdef SYSTEM_RTC_CR_VALUE
142     SIM_SCGC6 |= SIM_SCGC6_RTC_MASK;
143     if ((RTC_CR & RTC_CR_OSCE_MASK) == 0x00U) { /* Only if the OSCILLATOR is not already enabled */
144       RTC_CR = (uint32_t)((RTC_CR & (uint32_t)~(uint32_t)(RTC_CR_SC2P_MASK | RTC_CR_SC4P_MASK | RTC_CR_SC8P_MASK | RTC_CR_SC16P_MASK)) | (uint32_t)SYSTEM_RTC_CR_VALUE);
145       RTC_CR |= (uint32_t)RTC_CR_OSCE_MASK;
146       RTC_CR &= (uint32_t)~(uint32_t)RTC_CR_CLKO_MASK;
147     }
148 #endif
149   }
150 
151   /* Power mode protection initialization */
152 #ifdef SYSTEM_SMC_PMPROT_VALUE
153   SMC->PMPROT = SYSTEM_SMC_PMPROT_VALUE;
154 #endif
155 
156   /* System clock initialization */
157   /* Internal reference clock trim initialization */
158 #if defined(SLOW_TRIM_ADDRESS)
159   if ( *((uint8_t*)SLOW_TRIM_ADDRESS) != 0xFFU) {                              /* Skip if non-volatile flash memory is erased */
160     MCG->C3 = *((uint8_t*)SLOW_TRIM_ADDRESS);
161   #endif /* defined(SLOW_TRIM_ADDRESS) */
162   #if defined(SLOW_FINE_TRIM_ADDRESS)
163     MCG->C4 = (MCG->C4 & ~(MCG_C4_SCFTRIM_MASK)) | ((*((uint8_t*) SLOW_FINE_TRIM_ADDRESS)) & MCG_C4_SCFTRIM_MASK);
164   #endif
165   #if defined(FAST_TRIM_ADDRESS)
166     MCG->C4 = (MCG->C4 & ~(MCG_C4_FCTRIM_MASK)) |((*((uint8_t*) FAST_TRIM_ADDRESS)) & MCG_C4_FCTRIM_MASK);
167   #endif
168   #if defined(FAST_FINE_TRIM_ADDRESS)
169     MCG->C2 = (MCG->C2 & ~(MCG_C2_FCFTRIM_MASK)) | ((*((uint8_t*)FAST_TRIM_ADDRESS)) & MCG_C2_FCFTRIM_MASK);
170   #endif /* defined(FAST_FINE_TRIM_ADDRESS) */
171 #if defined(SLOW_TRIM_ADDRESS)
172   }
173   #endif /* defined(SLOW_TRIM_ADDRESS) */
174 
175   /* Set system prescalers and clock sources */
176   SIM->CLKDIV1 = SYSTEM_SIM_CLKDIV1_VALUE; /* Set system prescalers */
177   SIM->SOPT1 = ((SIM->SOPT1) & (uint32_t)(~(SIM_SOPT1_OSC32KSEL_MASK))) | ((SYSTEM_SIM_SOPT1_VALUE) & (SIM_SOPT1_OSC32KSEL_MASK)); /* Set 32 kHz clock so    urce (ERCLK32K) */
178   SIM->SOPT2 = ((SIM->SOPT2) & (uint32_t)(~(SIM_SOPT2_PLLFLLSEL_MASK))) | ((SYSTEM_SIM_SOPT2_VALUE) & (SIM_SOPT2_PLLFLLSEL_MASK)); /* Selects the high fr    equency clock for various peripheral clocking options. */
179 #if ((MCG_MODE == MCG_MODE_FEI) || (MCG_MODE == MCG_MODE_FBI) || (MCG_MODE == MCG_MODE_BLPI))
180   /* Set MCG and OSC */
181 #if  ((((SYSTEM_OSC_CR_VALUE) & OSC_CR_ERCLKEN_MASK) != 0x00U) || ((((SYSTEM_MCG_C5_VALUE) & MCG_C5_PLLCLKEN0_MASK) != 0x00U) && (((SYSTEM_MCG_C7_VALUE)     & MCG_C7_OSCSEL_MASK) == 0x00U)))
182   /* SIM_SCGC5: PORTA=1 */
         183   SIM_SCGC5 |= SIM_SCGC5_PORTA_MASK;
184   /* PORTA_PCR18: ISF=0,MUX=0 */
185   PORTA_PCR18 &= (uint32_t)~(uint32_t)((PORT_PCR_ISF_MASK | PORT_PCR_MUX(0x07)));
186   if (((SYSTEM_MCG_C2_VALUE) & MCG_C2_EREFS_MASK) != 0x00U) {
187   /* PORTA_PCR19: ISF=0,MUX=0 */
188   PORTA_PCR19 &= (uint32_t)~(uint32_t)((PORT_PCR_ISF_MASK | PORT_PCR_MUX(0x07)));
189   }
190 #endif
191   MCG->SC = SYSTEM_MCG_SC_VALUE;       /* Set SC (fast clock internal reference divider) */
192   MCG->C1 = SYSTEM_MCG_C1_VALUE;       /* Set C1 (clock source selection, FLL ext. reference divider, int. reference enable etc.) */
193   /* Check that the source of the FLL reference clock is the requested one. */
194   if (((SYSTEM_MCG_C1_VALUE) & MCG_C1_IREFS_MASK) != 0x00U) {
195     while((MCG->S & MCG_S_IREFST_MASK) == 0x00U) {
196     }
197   } else {
198     while((MCG->S & MCG_S_IREFST_MASK) != 0x00U) {
199     }
200   }
201   MCG->C2 = (MCG->C2 & (uint8_t)(~(MCG_C2_FCFTRIM_MASK))) | (SYSTEM_MCG_C2_VALUE & (uint8_t)(~(MCG_C2_LP_MASK))); /* Set C2 (freq. range, ext. and int. r    eference selection etc. excluding trim bits; low power bit is set later) */
202   MCG->C4 = ((SYSTEM_MCG_C4_VALUE) & (uint8_t)(~(MCG_C4_FCTRIM_MASK | MCG_C4_SCFTRIM_MASK))) | (MCG->C4 & (MCG_C4_FCTRIM_MASK | MCG_C4_SCFTRIM_MASK)); /*     Set C4 (FLL output; trim values not changed) */
203   OSC->CR = SYSTEM_OSC_CR_VALUE;       /* Set OSC_CR (OSCERCLK enable, oscillator capacitor load) */
204   MCG->C7 = SYSTEM_MCG_C7_VALUE;       /* Set C7 (OSC Clock Select) */
205   #if (MCG_MODE == MCG_MODE_BLPI)
206   /* BLPI specific */
207   MCG->C2 |= (MCG_C2_LP_MASK);         /* Disable FLL and PLL in bypass mode */
208   #endif
209 
210 #else /* MCG_MODE */
211   /* Set MCG and OSC */
212 #if  (((SYSTEM_OSC_CR_VALUE) & OSC_CR_ERCLKEN_MASK) != 0x00U) || (((SYSTEM_MCG_C7_VALUE) & MCG_C7_OSCSEL_MASK) == 0x00U)
213   /* SIM_SCGC5: PORTA=1 */
214   SIM_SCGC5 |= SIM_SCGC5_PORTA_MASK;
215   /* PORTA_PCR18: ISF=0,MUX=0 */
216   PORTA_PCR18 &= (uint32_t)~(uint32_t)((PORT_PCR_ISF_MASK | PORT_PCR_MUX(0x07)));
217   if (((SYSTEM_MCG_C2_VALUE) & MCG_C2_EREFS_MASK) != 0x00U) {
218   /* PORTA_PCR19: ISF=0,MUX=0 */
219   PORTA_PCR19 &= (uint32_t)~(uint32_t)((PORT_PCR_ISF_MASK | PORT_PCR_MUX(0x07)));
220   }
221 #endif
222   MCG->SC = SYSTEM_MCG_SC_VALUE;       /* Set SC (fast clock internal reference divider) */
223   MCG->C2 = (MCG->C2 & (uint8_t)(~(MCG_C2_FCFTRIM_MASK))) | (SYSTEM_MCG_C2_VALUE & (uint8_t)(~(MCG_C2_LP_MASK))); /* Set C2 (freq. range, ext. and int. r    eference selection etc. excluding trim bits; low power bit is set later) */
224   OSC->CR = SYSTEM_OSC_CR_VALUE;       /* Set OSC_CR (OSCERCLK enable, oscillator capacitor load) */
225   MCG->C7 = SYSTEM_MCG_C7_VALUE;       /* Set C7 (OSC Clock Select) */
226   #if (MCG_MODE == MCG_MODE_PEE)
227   MCG->C1 = (SYSTEM_MCG_C1_VALUE) | MCG_C1_CLKS(0x02); /* Set C1 (clock source selection, FLL ext. reference divider, int. reference enable etc.) - PBE m    ode*/
228   #else
229   MCG->C1 = SYSTEM_MCG_C1_VALUE;       /* Set C1 (clock source selection, FLL ext. reference divider, int. reference enable etc.) */
230   #endif
231   if ((((SYSTEM_MCG_C2_VALUE) & MCG_C2_EREFS_MASK) != 0x00U) && (((SYSTEM_MCG_C7_VALUE) & MCG_C7_OSCSEL_MASK) == 0x00U)) {
232     while((MCG->S & MCG_S_OSCINIT0_MASK) == 0x00U) { /* Check that the oscillator is running */
233     }
234   }
235   /* Check that the source of the FLL reference clock is the requested one. */
236   if (((SYSTEM_MCG_C1_VALUE) & MCG_C1_IREFS_MASK) != 0x00U) {
237     while((MCG->S & MCG_S_IREFST_MASK) == 0x00U) {
238     }
239   } else {
240     while((MCG->S & MCG_S_IREFST_MASK) != 0x00U) {
241     }
242   }
243   MCG->C4 = ((SYSTEM_MCG_C4_VALUE)  & (uint8_t)(~(MCG_C4_FCTRIM_MASK | MCG_C4_SCFTRIM_MASK))) | (MCG->C4 & (MCG_C4_FCTRIM_MASK | MCG_C4_SCFTRIM_MASK)); /* Set C4 (FLL output; trim values not changed) */
244 #endif /* MCG_MODE */
245 
246   /* Common for all MCG modes */
247 
248   /* PLL clock can be used to generate clock for some devices regardless of clock generator (MCGOUTCLK) mode. */
249   MCG->C5 = (SYSTEM_MCG_C5_VALUE) & (uint8_t)(~(MCG_C5_PLLCLKEN0_MASK)); /* Set C5 (PLL settings, PLL reference divider etc.) */
250   MCG->C6 = (SYSTEM_MCG_C6_VALUE) & (uint8_t)~(MCG_C6_PLLS_MASK); /* Set C6 (PLL select, VCO divider etc.) */
251   if ((SYSTEM_MCG_C5_VALUE) & MCG_C5_PLLCLKEN0_MASK) {
252     MCG->C5 |= MCG_C5_PLLCLKEN0_MASK;  /* PLL clock enable in mode other than PEE or PBE */
253   }
254   /* BLPE, PEE and PBE MCG mode specific */
255 
256 #if (MCG_MODE == MCG_MODE_BLPE)
257   MCG->C2 |= (MCG_C2_LP_MASK);         /* Disable FLL and PLL in bypass mode */
258 #elif ((MCG_MODE == MCG_MODE_PBE) || (MCG_MODE == MCG_MODE_PEE))
259   MCG->C6 |= (MCG_C6_PLLS_MASK);       /* Set C6 (PLL select, VCO divider etc.) */
260   while((MCG->S & MCG_S_LOCK0_MASK) == 0x00U) { /* Wait until PLL is locked*/
261   }
262   #if (MCG_MODE == MCG_MODE_PEE)
263   MCG->C1 &= (uint8_t)~(MCG_C1_CLKS_MASK);
264   #endif
265 #endif
266 #if ((MCG_MODE == MCG_MODE_FEI) || (MCG_MODE == MCG_MODE_FEE))
267   while((MCG->S & MCG_S_CLKST_MASK) != 0x00U) { /* Wait until output of the FLL is selected */
268   }
269   /* Use LPTMR to wait for 1ms dor FLL clock stabilization */
270   SIM_SCGC5 |= SIM_SCGC5_LPTMR_MASK;   /* Alow software control of LPMTR */
271   LPTMR0->CMR = LPTMR_CMR_COMPARE(0);  /* Default 1 LPO tick */
272   LPTMR0->CSR = (LPTMR_CSR_TCF_MASK | LPTMR_CSR_TPS(0x00));
273   LPTMR0->PSR = (LPTMR_PSR_PCS(0x01) | LPTMR_PSR_PBYP_MASK); /* Clock source: LPO, Prescaler bypass enable */
274   LPTMR0->CSR = LPTMR_CSR_TEN_MASK;    /* LPMTR enable */
275   while((LPTMR0_CSR & LPTMR_CSR_TCF_MASK) == 0u) {
276   }
277   LPTMR0_CSR = 0x00;                   /* Disable LPTMR */
278   SIM_SCGC5 &= (uint32_t)~(uint32_t)SIM_SCGC5_LPTMR_MASK;
279 #elif ((MCG_MODE == MCG_MODE_FBI) || (MCG_MODE == MCG_MODE_BLPI))
280   while((MCG->S & MCG_S_CLKST_MASK) != 0x04U) { /* Wait until internal reference clock is selected as MCG output */
281   }
282 #elif ((MCG_MODE == MCG_MODE_FBE) || (MCG_MODE == MCG_MODE_PBE) || (MCG_MODE == MCG_MODE_BLPE))
283   while((MCG->S & MCG_S_CLKST_MASK) != 0x08U) { /* Wait until external reference clock is selected as MCG output */
284   }
285 #elif (MCG_MODE == MCG_MODE_PEE)
286   while((MCG->S & MCG_S_CLKST_MASK) != 0x0CU) { /* Wait until output of the PLL is selected */
287   }
288 #endif
289 #if (((SYSTEM_SMC_PMCTRL_VALUE) & SMC_PMCTRL_RUNM_MASK) == (0x02U << SMC_PMCTRL_RUNM_SHIFT))
290   SMC->PMCTRL = (uint8_t)((SYSTEM_SMC_PMCTRL_VALUE) & (SMC_PMCTRL_RUNM_MASK)); /* Enable VLPR mode */
291   while(SMC->PMSTAT != 0x04U) {        /* Wait until the system is in VLPR mode */
292   }
293 #endif
294 
295 #if defined(SYSTEM_SIM_CLKDIV2_VALUE)
296   SIM->CLKDIV2 = ((SIM->CLKDIV2) & (uint32_t)(~(SIM_CLKDIV2_USBFRAC_MASK | SIM_CLKDIV2_USBDIV_MASK))) | ((SYSTEM_SIM_CLKDIV2_VALUE) & (SIM_CLKDIV2_USBFRAC_MASK | SIM_CLKDIV2_USBDIV_MASK)); /* Selects the USB clock divider. */
297 #endif
298 
299   /* PLL loss of lock interrupt request initialization */
300   if (((SYSTEM_MCG_C6_VALUE) & MCG_C6_LOLIE0_MASK) != 0U) {
301     NVIC_EnableIRQ(MCG_IRQn);          /* Enable PLL loss of lock interrupt request */
302   }
303 #endif
304 }

Seriously, this startup code scares me. I know that most of the parts are surrounded with #ifdefs, but the amount of "magic" values and the generally messy code style really discourage me. Do I need all that stuff? Chances are that for a large project I do. However, I doubt I need it for lighting one LED. Let's start everything from scratch.

Getting started

Most steps below are specific to K64F, but you can find them helpful also as a general approach for bringing-up any board.

Assuming you have the hardware already:

.
  1. Download Reference Manual for Freedom K64 Sub-Family.
  2. Inspect "Table 4-1. System memory map":
  3. Notice that program code and read-only data (including exception vectors) are located between 0x00000000 and 0x07FFFFFF. RAM is split into two regions: 0x1FFF0000–0x1FFFFFFF and 0x20000000–0x2002FFFF.
  4. According to the specs, we physically have 1MB of flash and 256KB of RAM on the board. This makes the last actually available flash address 0x000FFFFF and, indeed, 0x2002FFFF the last address of RAM.
  5. You can now read more about SRAM split in the reference manual. For purpose of this article, we stick to upper region (the one starting at 0x20000000).
  6. Now, in many cases we would have all the needed information. But in the case of the K64 family, we need to notice two more things. The first one is the "Flash configuration field". Refer to "29.3.1 Flash configuration field description" for details. In short: the flash addresses between 0x00000400 and 0x0000040F are very special. Values stored there configure other subsystems, so you cannot overwrite them with your application data or code. The other thing is the watchdog: "24.3.1 Unlocking and updating the watchdog". It contains the following statement:

    "Write 0xC520 followed by 0xD928 within 20 bus clock cycles to a specific unlock register (WDOG_UNLOCK)".

    We'll need this information later.
  7. Now, if you don't know what vector table is, download ARM ARM for Cortex M4 (ARMv7-m) and see "B1.5.3 The vector table":

    "The vector table contains the initialization value for the stack pointer, and the entry point addresses of each exception handler."

    K64F expects vector table to be at address 0x00000000 by default.
Linker script

Because we're starting the project from scratch, we need to create our own linker script. Let's name it for example k64f.ld and start editing it:

  1 MEMORY     
  2 {
  3     ROM_VECTORS (rx) : ORIGIN = 0x00000000, LENGTH = 0x00000400
  4     ROM_FLASH_CFG (rx) : ORIGIN = 0x00000400, LENGTH = 0x00000010
  5     ROM_TEXT (rx) : ORIGIN = 0x00000410, LENGTH = 512K
  6     RAM (rw) : ORIGIN = 0x20000000, LENGTH = 192K
  7 }

This part of linker script will define our target memory layout. In this example I choose 512K as size of ROM_TEXT, but remember you can increase it up to 1MB - ROM_VECTORS length - ROM_FLASH_CFG length. Generally, we see in the layout three regions in flash (vectors, config and code) and one region in RAM. This matches our observations from K64 Reference Manual. The names "ROM_VECTORS", "ROM_FLASH_CFG" etc. are chosen arbitrarily.

Now, we need to define which input section from input files will go to which output section of ELF:
"You use input section descriptions to tell the linker how to map the input files into your memory layout."
By default, compiler implicitly will create following input sections:
  • text - for program code.
  • rodata - for read-only data like constants.
  • data - for initialized global variables.
  • bss and COMMON - for uninitialized global variables.
Those are very basic sections that we can expect when we're not linking with standard library. Of course we can add explicitly our own custom sections for special purposes (we'll see later how). Our custom input sections will be:
  • vectors - for storing vector table.
  • flash_config - for storing K64F specific configuration data.
The main task we can do in the linker script is mapping input sections to output sections and creating our own symbols. Example linker script that places input sections "text" and "rodata" in the output section called "text" looks like this:
 25     .text :
 26     {
 27         . = ALIGN(4);
 28         *(.text*)
 29         *(.rodata*)
 30         . = ALIGN(4);
 31     } > ROM_TEXT

This script also tells that output section "text" should be mapped into ROM_TEXT address (which was defined by us already). Above example shows also that we can align our counter to 4 bytes before processing input sections. Counter (dot) will be explained later. The main conclusion from above example is that all input sections named text* and rodata*  will be placed in text output section.

Besides mapping input sections into output sections we can also declare global symbols in the linker script. Those symbols can be very useful. See for instance following example:

 43     .bss :
 44     {
 45         . = ALIGN(4);
 46         __bss_start__ = . ;
 47         *(.bss*)
 48         *(COMMON)
 49         __bss_end__ = . ;
 50         . = ALIGN(4);
 51     } > RAM

We see here two custom symbols created: __bss_start__ and __bss_end__ (names chosen arbitrarily). Those symbols can be accessed from C code. Their value is undetermined. However, the address of each symbol is defined and is equal to the value assigned to it in the linker script. In the above example, the assigned value was "." (dot). Dot is a special symbol in linker script syntax (the location counter) that holds the current address in the memory layout being processed. For example, before the bss section is processed by the linker, dot could be equal to 0x20000000. After that, depending on how many uninitialized global variables there are in the input files, the bss and COMMON sections will "stretch" the address space accordingly. Let's say bss is 256 bytes long and COMMON is 256 bytes long as well. After processing those two sections, dot will have the value 0x20000200 (512 bytes from 0x20000000). It means that the symbol __bss_start__ will be created at address 0x20000000 and the symbol __bss_end__ will be created at address 0x20000200. In C code (or through a debugger), if you print the value of __bss_start__ you'll get garbage. If you print &__bss_start__ you'll get 0x20000000.

OK, so we know that in the linker script we can map input sections to output sections and that we can create our own symbols. We also know how can we create a memory layout. Here's complete linker script for our example project:

  1 MEMORY                                                                      
  2 {
  3     ROM_VECTORS (rx) : ORIGIN = 0x00000000, LENGTH = 0x00000400
  4     ROM_FLASH_CFG (rx) : ORIGIN = 0x00000400, LENGTH = 0x00000010
  5     ROM_TEXT (rx) : ORIGIN = 0x00000410, LENGTH = 512K
  6     RAM (rw) : ORIGIN = 0x20000000, LENGTH = 192K
  7 }
  8 
  9 SECTIONS
 10 {
 11     .vectors :
 12     {
 13         . = ALIGN(4);
 14         *(.vectors)
 15         . = ALIGN(4);
 16     } > ROM_VECTORS
 17 
 18     .flash_cfg :
 19     {
 20         . = ALIGN(4);
 21         *(.flash_config)
 22         . = ALIGN(4);
 23     } > ROM_FLASH_CFG
 24 
 25     .text :
 26     {
 27         . = ALIGN(4);
 28         *(.text*)
 29         *(.rodata*)
 30         . = ALIGN(4);
 31     } > ROM_TEXT
 32 
 33     _sfdata = LOADADDR(.data);
 34     .data :
 35     {
 36         . = ALIGN(4);
 37         _sdata = .;
 38         *(.data*)
 39         _edata = .;
 40         . = ALIGN(4);
 41     } > RAM AT> ROM_TEXT
 42 
 43     .bss :
 44     {
 45         . = ALIGN(4);
 46         __bss_start__ = . ;
 47         *(.bss*)
 48         *(COMMON)
 49         __bss_end__ = . ;
 50         . = ALIGN(4);
 51     } > RAM
 52 
 53     _stack_top = ORIGIN(RAM) + LENGTH(RAM);
 54 } 

It's a very simplified linker script, without the sections needed by the standard library. What is also worth mentioning: because we need to create a binary file as our output and because there is no ELF bootloader on the board, the data section cannot be placed by the linker directly in RAM. If we instructed it to do this, the output file would be ~512MB in size. This is because the whole address space between the code (around 0x00000000) and RAM (around 0x20000000) would be included as well. This is why we redirect it with "AT> ROM_TEXT" (line 41). This is also why we create the _sfdata symbol. The whole concept is to store the data in flash and copy it into RAM at startup. We don't need to do the same with the bss section because it's actually an empty section (always). Instead, we will need to manually zero the address space between __bss_start__ and __bss_end__ on startup. The last thing worth mentioning is that we have created the _stack_top symbol at the end of RAM (ORIGIN(RAM) + LENGTH(RAM), i.e. just past the last accessible RAM address). We'll need it later.

Startup code

As we know from ARM ARM, processor will do two things upon starting:
  • Load Stack Pointer with value stored at the beginning of vector table.
  • Execute reset handler. Address to the reset handler is stored just after stack pointer in the vector table.
Our task is to prepare vector table and reset handler. In our example, we don't care about any exceptions beside the reset. In real-life scenario whole vector table must be implemented. Let's create file startup.s:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
.cpu cortex-m4              @ startup.s: vector table, flash config field, reset handler
.thumb

.section .vectors, "a"
    .word _stack_top        @ initial SP value; symbol created by the linker script
    .word _reset            @ address of the reset handler defined below

.section .flash_config, "a"
    .long 0xFFFFFFFF        @ K64F flash configuration field, 4 words = 16 bytes
    .long 0xFFFFFFFF
    .long 0xFFFFFFFF
    .long 0xFFFFFFFE

.section .text
.thumb_func
.global _reset
_reset:
    bl init                 @ C startup routine from startup.c
    bl main                 @ application entry point
    b .                     @ park here if main ever returns instead of running past the handler

Above we can see how to create the custom input sections (that we were talking about earlier). We've created the vectors (line 4) and flash_config (line 8) input sections. As we see, the vector table contains only two entries. The first one is the initial SP value and will be provided by our linker script. The second one is the address of the reset handler, which is defined in the same file at line 17. The flash_config section contains values specific to the K64F. You can decode them using the Reference Manual. Note that the last word of this configuration ends in FE (line 12); because the core is little-endian, that byte is stored at address 0x0000040C.

So, after connecting the power supply, the processor will load SP with the address of the _stack_top symbol and will jump to _reset, which in turn calls the init function. Let's create the startup.c file:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#define WDOG_STCTRLH (*(volatile unsigned short *)0x40052000u) /* watchdog STCTRLH register */
#define WDOG_UNLOCK (*(volatile unsigned short *)0x4005200Eu) /* watchdog unlock register */
/* Field helpers are unsigned: (short)0xC520 is negative, and left-shifting a negative value is undefined. */
#define WDOG_UNLOCK_WDOGUNLOCK_MASK 0xFFFFu
#define WDOG_UNLOCK_WDOGUNLOCK_SHIFT 0
#define WDOG_UNLOCK_WDOGUNLOCK_WIDTH 16
#define WDOG_UNLOCK_WDOGUNLOCK(x) (((unsigned short)(((unsigned short)(x))<<WDOG_UNLOCK_WDOGUNLOCK_SHIFT))&WDOG_UNLOCK_WDOGUNLOCK_MASK)
/* Single-bit flags of STCTRLH. */
#define WDOG_STCTRLH_WAITEN_MASK 0x80u
#define WDOG_STCTRLH_STOPEN_MASK 0x40u
#define WDOG_STCTRLH_ALLOWUPDATE_MASK 0x10u
#define WDOG_STCTRLH_CLKSRC_MASK 0x2u
/* BYTESEL field of STCTRLH: bits 13:12 (mask 0x3000, shift 12). */
#define WDOG_STCTRLH_BYTESEL_MASK 0x3000u
#define WDOG_STCTRLH_BYTESEL_SHIFT 12
#define WDOG_STCTRLH_BYTESEL(x) (((unsigned short)(((unsigned short)(x))<<WDOG_STCTRLH_BYTESEL_SHIFT))&WDOG_STCTRLH_BYTESEL_MASK)

/*
 * Symbols created by the linker script (k64f.ld). Only their addresses are
 * meaningful; declaring them as unsigned int lets init() walk the regions
 * one word at a time.
 */
extern unsigned int _sfdata;        /* load address of .data in flash */
extern unsigned int _edata;         /* end of .data in RAM */
extern unsigned int _sdata;         /* start of .data in RAM */
extern unsigned int __bss_start__;  /* start of .bss */
extern unsigned int __bss_end__;    /* end of .bss */

/*
 * init - C part of the reset sequence; _reset calls it before main().
 *
 * Disables the watchdog, copies the initialized data from flash to RAM and
 * zeroes .bss. Takes no arguments and returns nothing.
 */
void init(void)
{
    unsigned int *from;
    unsigned int *to;

    /* Unlock the watchdog: the two keys are written back to back. */
    WDOG_UNLOCK = WDOG_UNLOCK_WDOGUNLOCK(0xC520);
    WDOG_UNLOCK = WDOG_UNLOCK_WDOGUNLOCK(0xD928);

    /* Same value the SDK's SystemInit writes; bit 0 (WDOGEN) stays clear. */
    WDOG_STCTRLH = WDOG_STCTRLH_BYTESEL(0x00) |
        WDOG_STCTRLH_WAITEN_MASK |
        WDOG_STCTRLH_STOPEN_MASK |
        WDOG_STCTRLH_ALLOWUPDATE_MASK |
        WDOG_STCTRLH_CLKSRC_MASK |
        0x0100U;

    /* Copy .data from its load address in flash to its run address in RAM. */
    from = &_sfdata;
    to = &_sdata;
    while (to < &_edata)
    {
        *to++ = *from++;
    }

    /* Zero .bss so uninitialized globals start out as 0. */
    to = &__bss_start__;
    while (to < &__bss_end__)
    {
        *to++ = 0;
    }
}

What's happening here? Three things:
  • Disable watchdog
  • Copy data sections to RAM
  • Zero bss section
That's all we need.
The values in lines 1-16 can be found in the Reference Manual. They describe just a handful of registers which need to be written in a specific order to deactivate the watchdog. I mentioned this at the beginning.

The next thing we do is use the _sfdata, _sdata and _edata symbols. We created all of those symbols in the linker script. _sfdata is placed at the address in flash where the data section begins. _sdata is a symbol at the address where the data section should be placed in RAM. _edata is a symbol at the address where the data section should end.

Other symbols created in linker script (__bss_start__ and __bss_end__) are used as markers for address range which need to be zeroed. If we don't do this, our uninitialized global variables will have random values instead of expected zeros.

Application

As we see at line 19 of the startup.s file, after the init function returns we branch to the main function. Create the main.c file:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/* Register main() uses to enable clocks, and the bit number it sets for port B. */
#define SIM_SCGC5 (*(volatile int *)0x40048038)
#define SIM_SCGC5_PORTB 10

/* Pin control register of port B pin 21, and the bit position of its MUX field. */
#define PORTB_PCR21 (*(volatile int *)0x4004A054)
#define PORTB_PCR21_MUX 8

/* Port B register main() uses to make a pin an output; PIN_N is the LED's pin number. */
#define GPIOB_PDDR (*(volatile int *)0x400FF054)
#define PIN_N 21

/*
 * Lights the LED by setting up pin 21 as a GPIO output, then idles forever.
 *
 * Each step is a single read-modify-write of one memory-mapped register.
 * The final return statement is never reached.
 */
int main()
{
    const int clock_enable_bit = 1 << SIM_SCGC5_PORTB;
    const int gpio_mux_bit = 1 << PORTB_PCR21_MUX;
    const int led_pin_bit = 1 << PIN_N;

    /* Step 1: enable clocks. */
    SIM_SCGC5 |= clock_enable_bit;

    /* Step 2: configure pin 21 as GPIO. */
    PORTB_PCR21 |= gpio_mux_bit;

    /*
     * Step 3: make GPIO pin 21 an output. Its default output value
     * is 0, so the LED lights up (negative logic).
     */
    GPIOB_PDDR |= led_pin_bit;

    /* Nothing left to do: spin here forever. */
    for (;;)
    {
    }

    return 0;
}

Here we actually light the LED. Instead of using includes from the SDK, we just defined the register addresses in place. Note that in this particular example the volatile keyword is not crucial. However, in the general case you expect the compiler to always generate direct load/store instructions to those addresses instead of trying to keep the values in registers. This is because this memory can change outside the normal program flow, for example when it is modified from an exception handler.

Makefile

So, we have almost everything done. We have following files: k64f.ld, startup.s, startup.c and main.c. Now, let's use k64f.ld as our linker script and compile together startup.s, startup.c and main.c. K64F expects the output file to be in binary format. We'll create the ELF file, and then we'll translate it into bin using tool called objcopy (I assume you have installed GCC ARM Embedded). Create Makefile:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
# Cross toolchain (GCC ARM Embedded).
CC=arm-none-eabi-gcc
OBJCPY=arm-none-eabi-objcopy
# -nostdlib: do not link the standard library; startup.s and startup.c
# provide the startup code instead.
CFLAGS=-Wall -Wextra -mthumb -mcpu=cortex-m4 -nostdlib -g

# Neither target names a real file, so their recipes must always run.
.PHONY: all clean

# Build the ELF image with the custom linker script, then convert it to a raw binary.
# Recipe lines must start with a TAB character.
all:
	$(CC) startup.s startup.c main.c $(CFLAGS) -T k64f.ld -o simple.elf
	$(OBJCPY) simple.elf simple.bin -O binary

# -f keeps "make clean" from failing when there is nothing to remove.
clean:
	rm -f simple.*

Option nostdlib will instruct linker to not include standard library. Using option -T we can point to our custom linker script. We'll have two files as output: simple.elf which can be used during debugging and simple.bin which can be uploaded to the board using standard OpenSDA interface. I encourage you to check by yourself how the generated simple.elf file looks internally by issuing arm-none-eabi-objdump -D simple.elf command.

Summary

That's it. The minimal working GCC setup for K64F consists of just a couple of small files. It's a good starting point for developing more complex projects as well as a good exercise before analyzing large SDKs.

The project is available on bitbucket.

Friday 27 November 2015

Why does FUSE on Android suck?

Introduction

FUSE (Filesystem in Userspace) is a very useful mechanism in many applications. The thing is, those applications should not be focused on performance in terms of actual data transfers. FUSE has many advantages implied by userspace sandboxing, but performance certainly wasn't the main design consideration. I'm not saying that it is a bad design or that something is wrong with FUSE itself. It is just focused on other aspects like security, stability and ease of creating applications. The problem I'd like to discuss here is that Google decided to use FUSE as a frontend to actual data stored on the non-volatile memory.

FUSE has been introduced in Android 4.4 to handle "emulated" storage. Before that, "emulated" storage path was mounted as VFAT. Here's how it looked on old ICS (output of mount command):

/dev/block/vold/179:14 /mnt/sdcard vfat rw,dirsync,nosuid,nodev,noexec,relatime,uid=1000,gid=1015,fmask=0702,dmask=0702,allow_utime=0020,codepage=cp437,iocharset=iso8859-1,shortname=mixed,utf8,errors=remount-ro 0 0

Don't be confused by "sdcard" directory name. It is still internal flash. External storage is usually mounted as something like "sdcard1".

This kind of partition was needed because of compatibility reasons. The applications can store data no matter if it's internal or external flash. In case of storing data on external SD cards, system has to deal usually with FAT32 filesystem. FAT32 is quite different than EXT4 used by Android internally. For instance, it's not case sensitive and doesn't handle discretionary access control.

Because of adding more Android specific permissions, Google decided to use FUSE to emulate FAT32:

/dev/fuse /mnt/shell/emulated fuse rw,nosuid,nodev,noexec,relatime,user_id=1023,group_id=1023,default_permissions,allow_other 0 0

FUSE

So, how does it work on Android?

First of all, there is a FUSE support enabled in kernel. Complementarily, there is a userspace daemon called "sdcard". On boot, the sdcard daemon mounts a /dev/fuse device to the emulated directory:

/*
 * Opens /dev/fuse, mounts it at fuse->dest_path and records gid and mask
 * in the fuse structure.
 * Returns 0 on success, or -1 after logging an error if open() or mount() fails.
 */
1743static int fuse_setup(struct fuse* fuse, gid_t gid, mode_t mask) {
1744    char opts[256];
1745
1746    fuse->fd = open("/dev/fuse", O_RDWR);
1747    if (fuse->fd == -1) {
1748        ERROR("failed to open fuse device: %s\n", strerror(errno));
1749        return -1;
1750    }
1751
1752    umount2(fuse->dest_path, MNT_DETACH);  /* result is not checked */
1753
1754    snprintf(opts, sizeof(opts),  /* mount options carry the open fd plus the uid/gid from fuse->global */
1755            "fd=%i,rootmode=40000,default_permissions,allow_other,user_id=%d,group_id=%d",
1756            fuse->fd, fuse->global->uid, fuse->global->gid);
1757    if (mount("/dev/fuse", fuse->dest_path, "fuse", MS_NOSUID | MS_NODEV | MS_NOEXEC |
1758            MS_NOATIME, opts) != 0) {
1759        ERROR("failed to mount fuse filesystem: %s\n", strerror(errno));
1760        return -1;
1761    }
1762
1763    fuse->gid = gid;  /* stored only after the mount succeeded */
1764    fuse->mask = mask;
1765
1766    return 0;
1767}

After that, it polls on FUSE device waiting for messages from the kernel:

1581static void handle_fuse_requests(struct fuse_handler* handler)
1582{
1583    struct fuse* fuse = handler->fuse;
1584    for (;;) {
1585        ssize_t len = TEMP_FAILURE_RETRY(read(fuse->fd,
1586                handler->request_buffer, sizeof(handler->request_buffer)));
1587        if (len < 0) {
1588            if (errno == ENODEV) {
1589                ERROR("[%d] someone stole our marbles!\n", handler->token);
1590                exit(2);
1591            }
1592            ERROR("[%d] handle_fuse_requests: errno=%d\n", handler->token, errno);
1593            continue;
1594        }
(..)

From now on, every file operation inside a directory mounted through FUSE will be handled in a specific way. For example, let's say we'd like to read the file "test.txt" located at /sdcard/test.txt. Note again: "sdcard" means internal flash.


root@android: # cd /sdcard
root@android:/sdcard # cat test.txt

We expect cat to issue open(), read() and close() system calls during that operation. Let's have a look at what we get using strace:

root@android:/sdcard # strace -f -e open,openat,read,close cat test.txt
(..)
>>stripped output related to loading "cat" by shell<<
(..)                             = 0
openat(AT_FDCWD, "test.txt", O_RDONLY)  = 3
read(3, "1234\n", 1024)                 = 5
read(3, "", 1024)                       = 0
close(3)                                = 0

Looks ok, but hey, what is the sdcard daemon doing in the meantime? Let's strace sdcard at the same time:

root@android: # ps | grep sdcard
media_rw  714   1     23096  1528  ffffffff 81ca6254 S /system/bin/sdcard
root@android: # strace -f -p 714 
Process 714 attached with 3 threads
[pid   916] read(3,  <unfinished ...>
[pid   915] read(3,  <unfinished ...>
[pid   714] read(4,  <unfinished ...>
[pid   916] <... read resumed> "1\0\0\0\1\0\0\0\2\234\3\0\0\0\0\0\200\200@\200\177\0\0\0\0\0\0\0\0\0\0\0"..., 262224) = 49
[pid   916] faccessat(AT_FDCWD, "/data/media/0/test.txt", F_OK) = 0
[pid   916] newfstatat(AT_FDCWD, "/data/media/0/test.txt", {st_mode=S_IFREG|0664, st_size=5, ...}, AT_SYMLINK_NOFOLLOW) = 0
[pid   916] writev(3, [{"\220\0\0\0\0\0\0\0\2\234\3\0\0\0\0\0", 16}, {"\200\261\317\200\177\0\0\0\223(\0\0\0\0\0\0\n\0\0\0\0\0\0\0\n\0\0\0\0\0\0\0"..., 128}], 2) = 144
[pid   915] <... read resumed> "0\0\0\0\16\0\0\0\3\234\3\0\0\0\0\0\200\261\317\200\177\0\0\0\0\0\0\0\0\0\0\0"..., 262224) = 48
[pid   916] read(3,  <unfinished ...>
[pid   915] openat(AT_FDCWD, "/data/media/0/test.txt", O_RDONLY|O_LARGEFILE) = 5
[pid   915] writev(3, [{" \0\0\0\0\0\0\0\3\234\3\0\0\0\0\0", 16}, {"\260p\300\200\177\0\0\0\0\0\0\0\0\0\0\0", 16}], 2 <unfinished ...>
[pid   916] <... read resumed> "P\0\0\0\17\0\0\0\4\234\3\0\0\0\0\0\200\261\317\200\177\0\0\0\0\0\0\0\0\0\0\0"..., 262224) = 80
[pid   915] <... writev resumed> )      = 32
[pid   916] pread64(5,  <unfinished ...>
[pid   915] read(3,  <unfinished ...>
[pid   916] <... pread64 resumed> "1234\n", 4096, 0) = 5
[pid   916] writev(3, [{"\25\0\0\0\0\0\0\0\4\234\3\0\0\0\0\0", 16}, {"1234\n", 5}], 2) = 21
[pid   915] <... read resumed> "8\0\0\0\3\0\0\0\5\234\3\0\0\0\0\0\200\261\317\200\177\0\0\0\0\0\0\0\0\0\0\0"..., 262224) = 56
[pid   916] read(3,  <unfinished ...>
[pid   915] newfstatat(AT_FDCWD, "/data/media/0/test.txt", {st_mode=S_IFREG|0664, st_size=5, ...}, AT_SYMLINK_NOFOLLOW) = 0
[pid   915] writev(3, [{"x\0\0\0\0\0\0\0\5\234\3\0\0\0\0\0", 16}, {"\n\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\224(\0\0\0\0\0\0\5\0\0\0\0\0\0\0"..., 104}], 2) = 120
[pid   916] <... read resumed> "@\0\0\0\31\0\0\0\6\234\3\0\0\0\0\0\200\261\317\200\177\0\0\0\0\0\0\0\0\0\0\0"..., 262224) = 64
[pid   916] write(3, "\20\0\0\0\0\0\0\0\6\234\3\0\0\0\0\0", 16) = 16
[pid   916] read(3, "@\0\0\0\22\0\0\0\7\234\3\0\0\0\0\0\200\261\317\200\177\0\0\0\0\0\0\0\0\0\0\0"..., 262224) = 64
[pid   916] close(5)                    = 0
[pid   916] write(3, "\20\0\0\0\0\0\0\0\7\234\3\0\0\0\0\0", 16) = 16
[pid   916] read(3,  <unfinished ...>
[pid   915] read(3, ^CProcess 714 detached
Process 915 detached

A lot is happening. This is because each file operation will now work in the following way:
  1. Userspace application issues system call that will be handled by FUSE driver in kernel (we see it in the first strace output)
  2. FUSE driver in kernel notifies userspace daemon (sdcard) about new request
  3. Userspace daemon reads /dev/fuse
  4. Userspace daemon parses command and recognizes file operation (ex. open)
  5. Userspace daemon issues system call to the actual filesystem (EXT4)
  6. Kernel handles physical data access and sends data back to the userspace
  7. Userspace modifies (or not) data and passes it through /dev/fuse to kernel again
  8. Kernel completes original system call and moves data to the actual userspace application (in our example cat)
Uff, that's a lot, isn't it? 

Performance

Let's see what side effects such an approach has. The obvious one is the performance overhead of each additional system call. Here are the numbers (all tests were performed several times, and similar results were observed each time):

Test #1: Copy big file within one partition.

EXT4 FS:

root@android:/data # echo 3 > /proc/sys/vm/drop_caches
root@android:/data # dd if=bigbuck.in of=bigbuck.out bs=1m                      
691+1 records in
691+1 records out
725106140 bytes transferred in 10.779 secs (67270260 bytes/sec)

FUSE:

root@android:/sdcard # echo 3 > /proc/sys/vm/drop_caches                      
root@android:/sdcard # dd if=bigbuck.in of=bigbuck.out bs=1m                  
691+1 records in
691+1 records out
725106140 bytes transferred in 13.031 secs (55644704 bytes/sec)

RESULT:

In this test, FUSE is about 17% slower.

Test #2: Copy a lot of small files within one partition. There were 10 000 files each one 5kB of size.

EXT4 FS:

root@android:/data # echo 3 > /proc/sys/vm/drop_caches
root@android:/data # time cp small/* small2/                                  
    0m17.27s real     0m0.32s user     0m6.07s system

FUSE:

root@android:/sdcard # echo 3 > /proc/sys/vm/drop_caches                      
root@android:/sdcard # time cp small/* small2/                                
    1m3.03s real     0m1.05s user     0m9.59s system

RESULT:

I think the comment is superfluous. It took over 1 minute (!) to copy ~50MB of small files on FUSE mounted partition in comparison to ~17 seconds on EXT4 FS.

Double caching

Another implication is double caching of data.  Linux Kernel uses page cache mechanism to store recently accessed data in memory, specifically data from a non-volatile storage. This greatly improves data access performance. However, we don't want to have the same data cached twice. Unfortunately, this will happen because of the way in which FUSE is used on Android.

Observing double caching behavior caused by FUSE is very simple:
  1. Create file with a known size
  2. Copy it into /sdcard folder on the phone (/sdcard is a symlink to /storage/emulated/legacy which is a symlink to /mnt/shell/emulated/0 which is mounted as FUSE)
  3. Drop page cache -> take a snapshot of page cache usage -> read test file -> take another snapshot of page cache -> see a difference between page cache usage before and after reading the file:
root@android: # echo 3 > /proc/sys/vm/drop_caches ; sleep 1 ; cat /proc/meminfo | grep Cache ; cat /sdcard/test_file > /dev/null ; cat /proc/meminfo | grep Cache

If size of the file is for example 10MB we'll get something like this:

before file operation:  

Cached: 241864 kB

after file operation: 

Cached: 263072 kB

The expected result would be 10MB more than 241MB in cache, so something around 251MB. Instead, we see 263MB in cache after reading 10MB of data. It means the kernel cached twice as much as needed. The same test performed directly on EXT4 FS (for instance in the /data folder) will show, as expected, 10MB more of cached pages.

So, we have the same data cached twice: once for the user application that issued the original open/read system calls and once for the "sdcard" daemon. The first copy is cached by FUSE, the second one by the EXT4 FS.

When I first noticed it I tried to force FUSE to skip caching. Here are my notes from that time:
We can skip the fuse cache by providing FOPEN_DIRECT_IO inside the kernel. I tested this solution, however it affected performance significantly. Although caching works ok (meaning there is only one copy of the data in cache and subsequent reads don't generate I/O to the flash), there is additional overhead from switching more often between the sdcard daemon and the fuse fs in the kernel. Maybe it can be tweaked more.
There is FOPEN_KEEP_CACHE in fuse that might be useful – it needs more investigation.
Other solution is to provide O_DIRECT flag in sdcard daemon when it's opening ext4fs files. We then discard caches from ext4fs and we should be able to use page cache created by fuse. However, using O_DIRECT requires user buffers to be aligned in memory to the block size. Also the size of data chunks should be aligned. Sdcard daemon is prepared for external O_DIRECT requests by Google:
https://android-review.googlesource.com/#/c/82141/4/
https://android-review.googlesource.com/#/c/82570/ . The possible solution would be to enable KEEP_CACHE in Fuse on kernel side and use O_DIRECT to all sdcard daemon requests. I did it for ‘read’ case and it works, however there is a significant overhead for the first read of data. Subsequent reads are much faster than originally (due to caching in fuse). Using it for writes may be tricky though.

Another way to solve it is to call fadvise with POSIX_FADV_DONTNEED in the sdcard daemon. I tested it as well; however, again, it affects performance too much.
Basically, the most important conclusion from above investigation was: get rid of FUSE and implement FAT32 emulation layer inside kernel.

Other issues

Besides performance and double caching, there are other problems with FUSE on Android. For instance, not all features of FAT32 are implemented in the sdcard daemon. There were issues with the utime() system call and with the lack of full support for the O_DIRECT flag.

I don't want to blame Google only. As officially stated:
"Devices may provide external storage by emulating a case-insensitive, permissionless filesystem backed by internal storage. One possible implementation is provided by the FUSE daemon in system/core/sdcard, which can be added as a device-specific init.rc service". 
FUSE daemon is only example implementation that is easiest to maintain, but it has also a lot of drawbacks.
What's more interesting, some mobile vendors (Samsung, Motorola) have already realized it and replaced FUSE with their own in-kernel (or mixed) implementation. Samsung has created a driver based on WrapFS called "sdcardfs". In my opinion it's the best approach: use WrapFS to implement the FAT32 emulation layer inside the kernel. Whether Samsung implemented it correctly is another question, but from what I saw in the officially published Samsung kernel sources it's not so bad.

Summary

To sum up why FUSE on Android sucks:
  • Performance
  • Double caching
  • Several other minor defects, like missing allow_utime flag
Note, Android as an operating system doesn't access files via FUSE internally. However, high-level applications do. Use cases like saving photos from camera, recording videos or reading offline maps will suffer the most from FUSE drawbacks described in this article.