diff options
author | Gregory Nutt <gnutt@nuttx.org> | 2014-05-06 14:58:48 -0600 |
---|---|---|
committer | Gregory Nutt <gnutt@nuttx.org> | 2014-05-06 14:58:48 -0600 |
commit | ac164620963d88128b5a3d52ee073468193e0d2b (patch) | |
tree | 8b82c291460978eea744c44343dbfe5fb90f2984 | |
parent | 9a648eeafd3bcb0d9f6b7519a279a78973f466a4 (diff) | |
download | nuttx-ac164620963d88128b5a3d52ee073468193e0d2b.tar.gz nuttx-ac164620963d88128b5a3d52ee073468193e0d2b.tar.bz2 nuttx-ac164620963d88128b5a3d52ee073468193e0d2b.zip |
Optimized memcpy() function for the ARMv7-A from David Sidrane
-rw-r--r-- | nuttx/ChangeLog | 3 | ||||
-rw-r--r-- | nuttx/arch/arm/src/a1x/Make.defs | 4 | ||||
-rw-r--r-- | nuttx/arch/arm/src/armv7-a/arm_memcpy.S | 416 | ||||
-rw-r--r-- | nuttx/arch/arm/src/sama5/Make.defs | 4 |
4 files changed, 427 insertions, 0 deletions
diff --git a/nuttx/ChangeLog b/nuttx/ChangeLog index ccc431f9e..376edcdcc 100644 --- a/nuttx/ChangeLog +++ b/nuttx/ChangeLog @@ -7297,3 +7297,6 @@ * arch/arm/src/stm32/stm32f40xxx_rcc.c and configs/nucleo-f401re/include/board.h: The Nucleo-F401RE has no on-board crystals and, hence, needs to run with the PLL input from the on-chip HSI clock (2014-5-6). + * arch/arm/src/armv7-a/arm_memcpy.S: This is the same optimized memcpy() + function that Mike Smith brought in for the ARMv7-M with minor tweaks + by David Sidrane to work with the ARMv7-A (2014-5-6). diff --git a/nuttx/arch/arm/src/a1x/Make.defs b/nuttx/arch/arm/src/a1x/Make.defs index 9acf9ab60..aac0e5fc3 100644 --- a/nuttx/arch/arm/src/a1x/Make.defs +++ b/nuttx/arch/arm/src/a1x/Make.defs @@ -54,6 +54,10 @@ CMN_ASRCS += arm_saveusercontext.S arm_vectoraddrexcptn.S arm_vfork.S CMN_ASRCS += cp15_coherent_dcache.S cp15_invalidate_dcache.S CMN_ASRCS += cp15_clean_dcache.S cp15_flush_dcache.S cp15_invalidate_dcache_all.S +ifeq ($(CONFIG_ARCH_MEMCPY),y) +CMN_ASRCS += arm_memcpy.S +endif + # Common C source files CMN_CSRCS = up_initialize.c up_idle.c up_interruptcontext.c up_exit.c diff --git a/nuttx/arch/arm/src/armv7-a/arm_memcpy.S b/nuttx/arch/arm/src/armv7-a/arm_memcpy.S new file mode 100644 index 000000000..a6600659a --- /dev/null +++ b/nuttx/arch/arm/src/armv7-a/arm_memcpy.S @@ -0,0 +1,416 @@ +/************************************************************************************ + * nuttx/arch/arm/src/armv7-m/up_memcpy.S + * + * armv7m-optimised memcpy, contributed by Mike Smith. Apparently in the public + * domain and is re-released here under the modified BSD license: + * + * Obtained via a posting on the Stellaris forum: + * http://e2e.ti.com/support/microcontrollers/\ + * stellaris_arm_cortex-m3_microcontroller/f/473/t/44360.aspx + * + * Posted by rocksoft on Jul 24, 2008 10:19 AM + * + * Hi, + * + * I recently finished a "memcpy" replacement and thought it might be useful for + * others... 
+ * + * I've put some instructions and the code here: + * + * http://www.rock-software.net/downloads/memcpy/ + * + * Hope it works for you as well as it did for me. + * + * Liam. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name NuttX nor the names of its contributors may be + * used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + ************************************************************************************/ + +/************************************************************************************ + * Global Symbols + ************************************************************************************/ + + .global memcpy + + .syntax unified + .thumb + + .file "arm_memcpy.S" + +/************************************************************************************ + * .text + ************************************************************************************/ + + .text + +/************************************************************************************ + * Private Constant Data + ************************************************************************************/ + +/* We have 16 possible alignment combinations of src and dst, this jump table + * directs the copy operation + * + * Bits: Src=00, Dst=00 - Long to Long copy + * Bits: Src=00, Dst=01 - Long to Byte before half word + * Bits: Src=00, Dst=10 - Long to Half word + * Bits: Src=00, Dst=11 - Long to Byte before long word + * Bits: Src=01, Dst=00 - Byte before half word to long + * Bits: Src=01, Dst=01 - Byte before half word to byte before half word - + * Same alignment + * Bits: Src=01, Dst=10 - Byte before half word to half word + * Bits: Src=01, Dst=11 - Byte before half word to byte before long word + * Bits: Src=10, Dst=00 - Half word to long word + * Bits: Src=10, Dst=01 - Half word to byte before half word + * Bits: Src=10, Dst=10 - Half word to half word - Same Alignment + * Bits: Src=10, Dst=11 - Half word to byte before long word + * Bits: Src=11, Dst=00 - Byte before long word to long word + * Bits: Src=11, Dst=01 - Byte before long word to byte before half word + * Bits: Src=11, Dst=10 - Byte before long word to half word + * Bits: Src=11, Dst=11 - Byte before long word to Byte before long word - + * Same alignment + */ + +MEM_DataCopyTable: + .byte (MEM_DataCopy0 - MEM_DataCopyJump) >> 1 + .byte 
(MEM_DataCopy1 - MEM_DataCopyJump) >> 1 + .byte (MEM_DataCopy2 - MEM_DataCopyJump) >> 1 + .byte (MEM_DataCopy3 - MEM_DataCopyJump) >> 1 + .byte (MEM_DataCopy4 - MEM_DataCopyJump) >> 1 + .byte (MEM_DataCopy5 - MEM_DataCopyJump) >> 1 + .byte (MEM_DataCopy6 - MEM_DataCopyJump) >> 1 + .byte (MEM_DataCopy7 - MEM_DataCopyJump) >> 1 + .byte (MEM_DataCopy8 - MEM_DataCopyJump) >> 1 + .byte (MEM_DataCopy9 - MEM_DataCopyJump) >> 1 + .byte (MEM_DataCopy10 - MEM_DataCopyJump) >> 1 + .byte (MEM_DataCopy11 - MEM_DataCopyJump) >> 1 + .byte (MEM_DataCopy12 - MEM_DataCopyJump) >> 1 + .byte (MEM_DataCopy13 - MEM_DataCopyJump) >> 1 + .byte (MEM_DataCopy14 - MEM_DataCopyJump) >> 1 + .byte (MEM_DataCopy15 - MEM_DataCopyJump) >> 1 + + .align 2 + +MEM_LongCopyTable: + .byte (MEM_LongCopyEnd - MEM_LongCopyJump) >> 1 /* 0 bytes left */ + .byte 0 /* 4 bytes left */ + .byte (1 * 10) >> 1 /* 8 bytes left */ + .byte (2 * 10) >> 1 /* 12 bytes left */ + .byte (3 * 10) >> 1 /* 16 bytes left */ + .byte (4 * 10) >> 1 /* 20 bytes left */ + .byte (5 * 10) >> 1 /* 24 bytes left */ + .byte (6 * 10) >> 1 /* 28 bytes left */ + .byte (7 * 10) >> 1 /* 32 bytes left */ + .byte (8 * 10) >> 1 /* 36 bytes left */ + + .align 2 + +/************************************************************************************ + * Public Functions + ************************************************************************************/ +/************************************************************************************ + * Name: memcpy + * + * Description: + * Optimised "general" copy routine + * + * Input Parameters: + * r0 = destination, r1 = source, r2 = length + * + ************************************************************************************/ + + .thumb_func +memcpy: + push {r14} + + /* This allows the inner workings to "assume" a minimum amount of bytes */ + /* Quickly check for very short copies */ + + cmp r2, #4 + blt.n MEM_DataCopyBytes + + and r14, r0, #3 /* Get destination alignment bits */ + bfi r14, r1, 
#2, #2 /* Get source alignment bits */ + ldr r3, =MEM_DataCopyTable /* Jump table base */ + tbb [r3, r14] /* Perform jump on src/dst alignment bits */ +MEM_DataCopyJump: + + .align 4 + +/* Bits: Src=01, Dst=01 - Byte before half word to byte before half word - Same alignment + * 3 bytes to read for long word aligning + */ + +MEM_DataCopy5: + ldrb r3, [r1], #0x01 + strb r3, [r0], #0x01 + sub r2, r2, #0x01 + +/* Bits: Src=10, Dst=10 - Half word to half word - Same Alignment + * 2 bytes to read for long word aligning + */ + +MEM_DataCopy10: + ldrb r3, [r1], #0x01 + strb r3, [r0], #0x01 + sub r2, r2, #0x01 + +/* Bits: Src=11, Dst=11 - Byte before long word to Byte before long word - Same alignment + * 1 bytes to read for long word aligning + */ + +MEM_DataCopy15: + ldrb r3, [r1], #0x01 + strb r3, [r0], #0x01 + sub r2, r2, #0x01 + +/* Bits: Src=00, Dst=00 - Long to Long copy */ + +MEM_DataCopy0: + /* Save regs that may be used by memcpy */ + + push {r4-r12} + + /* Check for short word-aligned copy */ + + cmp r2, #0x28 + blt.n MEM_DataCopy0_2 + + /* Bulk copy loop */ + +MEM_DataCopy0_1: + ldmia r1!, {r3-r12} + stmia r0!, {r3-r12} + sub r2, r2, #0x28 + cmp r2, #0x28 + bge.n MEM_DataCopy0_1 + + /* Copy remaining long words */ + +MEM_DataCopy0_2: + /* Copy remaining long words */ + + ldr r14, =MEM_LongCopyTable + lsr r11, r2, #0x02 + tbb [r14, r11] + + /* longword copy branch table anchor */ + +MEM_LongCopyJump: + ldr.w r3, [r1], #0x04 /* 4 bytes remain */ + str.w r3, [r0], #0x04 + b.n MEM_LongCopyEnd + ldmia.w r1!, {r3-r4} /* 8 bytes remain */ + stmia.w r0!, {r3-r4} + b.n MEM_LongCopyEnd + ldmia.w r1!, {r3-r5} /* 12 bytes remain */ + stmia.w r0!, {r3-r5} + b.n MEM_LongCopyEnd + ldmia.w r1!, {r3-r6} /* 16 bytes remain */ + stmia.w r0!, {r3-r6} + b.n MEM_LongCopyEnd + ldmia.w r1!, {r3-r7} /* 20 bytes remain */ + stmia.w r0!, {r3-r7} + b.n MEM_LongCopyEnd + ldmia.w r1!, {r3-r8} /* 24 bytes remain */ + stmia.w r0!, {r3-r8} + b.n MEM_LongCopyEnd + ldmia.w r1!, {r3-r9} /* 28 
bytes remain */ + stmia.w r0!, {r3-r9} + b.n MEM_LongCopyEnd + ldmia.w r1!, {r3-r10} /* 32 bytes remain */ + stmia.w r0!, {r3-r10} + b.n MEM_LongCopyEnd + ldmia.w r1!, {r3-r11} /* 36 bytes remain */ + stmia.w r0!, {r3-r11} + +MEM_LongCopyEnd: + pop {r4-r12} + and r2, r2, #0x03 /* All the longs have been copied */ + + /* Deal with up to 3 remaining bytes */ + +MEM_DataCopyBytes: + /* Deal with up to 3 remaining bytes */ + + cmp r2, #0x00 + it eq + popeq {pc} + ldrb r3, [r1], #0x01 + strb r3, [r0], #0x01 + subs r2, r2, #0x01 + it eq + popeq {pc} + ldrb r3, [r1], #0x01 + strb r3, [r0], #0x01 + subs r2, r2, #0x01 + it eq + popeq {pc} + ldrb r3, [r1], #0x01 + strb r3, [r0], #0x01 + pop {pc} + + .align 4 + +/* Bits: Src=01, Dst=11 - Byte before half word to byte before long word + * 3 bytes to read for long word aligning the source + */ + +MEM_DataCopy7: + ldrb r3, [r1], #0x01 + strb r3, [r0], #0x01 + sub r2, r2, #0x01 + +/* Bits: Src=10, Dst=00 - Half word to long word + * 2 bytes to read for long word aligning the source + */ + +MEM_DataCopy8: + ldrb r3, [r1], #0x01 + strb r3, [r0], #0x01 + sub r2, r2, #0x01 + +/* Bits: Src=11, Dst=01 - Byte before long word to byte before half word + * 1 byte to read for long word aligning the source + */ + +MEM_DataCopy13: + ldrb r3, [r1], #0x01 + strb r3, [r0], #0x01 + sub r2, r2, #0x01 + +/* Bits: Src=00, Dst=10 - Long to Half word */ + +MEM_DataCopy2: + cmp r2, #0x28 + blt.n MEM_DataCopy2_1 + + /* Save regs */ + + push {r4-r12} + + /* Bulk copy loop */ + +MEM_DataCopy2_2: + ldmia r1!, {r3-r12} + + strh r3, [r0], #0x02 + + lsr r3, r3, #0x10 + bfi r3, r4, #0x10, #0x10 + lsr r4, r4, #0x10 + bfi r4, r5, #0x10, #0x10 + lsr r5, r5, #0x10 + bfi r5, r6, #0x10, #0x10 + lsr r6, r6, #0x10 + bfi r6, r7, #0x10, #0x10 + lsr r7, r7, #0x10 + bfi r7, r8, #0x10, #0x10 + lsr r8, r8, #0x10 + bfi r8, r9, #0x10, #0x10 + lsr r9, r9, #0x10 + bfi r9, r10, #0x10, #0x10 + lsr r10, r10, #0x10 + bfi r10, r11, #0x10, #0x10 + lsr r11, r11, #0x10 + bfi r11, r12, 
#0x10, #0x10 + stmia r0!, {r3-r11} + lsr r12, r12, #0x10 + strh r12, [r0], #0x02 + + sub r2, r2, #0x28 + cmp r2, #0x28 + bge.n MEM_DataCopy2_2 + pop {r4-r12} + +MEM_DataCopy2_1: /* Read longs and write 2 x half words */ + cmp r2, #4 + blt.n MEM_DataCopyBytes + ldr r3, [r1], #0x04 + strh r3, [r0], #0x02 + lsr r3, r3, #0x10 + strh r3, [r0], #0x02 + sub r2, r2, #0x04 + b.n MEM_DataCopy2 + +/* Bits: Src=01, Dst=00 - Byte before half word to long + * Bits: Src=01, Dst=10 - Byte before half word to half word + * 3 bytes to read for long word aligning the source + */ + +MEM_DataCopy4: +MEM_DataCopy6: + /* Read B and write B */ + + ldrb r3, [r1], #0x01 + strb r3, [r0], #0x01 + sub r2, r2, #0x01 + +/* Bits: Src=10, Dst=01 - Half word to byte before half word + * Bits: Src=10, Dst=11 - Half word to byte before long word + * 2 bytes to read for long word aligning the source + */ + +MEM_DataCopy9: +MEM_DataCopy11: + ldrb r3, [r1], #0x01 + strb r3, [r0], #0x01 + sub r2, r2, #0x01 + +/* Bits: Src=11, Dst=00 - Byte before long word to long word + * Bits: Src=11, Dst=10 - Byte before long word to half word + * 1 byte to read for long word aligning the source + */ + +MEM_DataCopy12: +MEM_DataCopy14: + /* Read B and write B */ + + ldrb r3, [r1], #0x01 + strb r3, [r0], #0x01 + sub r2, r2, #0x01 + +/* Bits: Src=00, Dst=01 - Long to Byte before half word + * Bits: Src=00, Dst=11 - Long to Byte before long word + */ + +MEM_DataCopy1: /* Read longs, write B->H->B */ +MEM_DataCopy3: + cmp r2, #4 + blt MEM_DataCopyBytes + ldr r3, [r1], #0x04 + strb r3, [r0], #0x01 + lsr r3, r3, #0x08 + strh r3, [r0], #0x02 + lsr r3, r3, #0x10 + strb r3, [r0], #0x01 + sub r2, r2, #0x04 + b.n MEM_DataCopy3 + + .size memcpy, .-memcpy + .end diff --git a/nuttx/arch/arm/src/sama5/Make.defs b/nuttx/arch/arm/src/sama5/Make.defs index 181abe1ed..0183f7565 100644 --- a/nuttx/arch/arm/src/sama5/Make.defs +++ b/nuttx/arch/arm/src/sama5/Make.defs @@ -54,6 +54,10 @@ CMN_ASRCS += arm_saveusercontext.S 
arm_vectoraddrexcptn.S arm_vfork.S CMN_ASRCS += cp15_coherent_dcache.S cp15_invalidate_dcache.S CMN_ASRCS += cp15_clean_dcache.S cp15_flush_dcache.S cp15_invalidate_dcache_all.S +ifeq ($(CONFIG_ARCH_MEMCPY),y) +CMN_ASRCS += arm_memcpy.S +endif + # Common C source files CMN_CSRCS = up_initialize.c up_idle.c up_interruptcontext.c up_exit.c |