Mailing List Archive: [PATCH RFC 11/25] arm: bit manipulation, copy and division libraries

From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>

Bit manipulation, division and memcpy & friends implementations for the
ARM architecture, shamelessly taken from Linux.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Tim Deegan <Tim.Deegan@citrix.com>
---
xen/arch/arm/lib/Makefile | 5 +
xen/arch/arm/lib/assembler.h | 49 +++++++
xen/arch/arm/lib/bitops.h | 36 +++++
xen/arch/arm/lib/changebit.S | 18 +++
xen/arch/arm/lib/clearbit.S | 19 +++
xen/arch/arm/lib/copy_template.S | 266 ++++++++++++++++++++++++++++++++++++
xen/arch/arm/lib/div64.S | 149 +++++++++++++++++++++
xen/arch/arm/lib/findbit.S | 115 ++++++++++++++++
xen/arch/arm/lib/lib1funcs.S | 274 ++++++++++++++++++++++++++++++++++++++
xen/arch/arm/lib/memcpy.S | 64 +++++++++
xen/arch/arm/lib/memmove.S | 200 +++++++++++++++++++++++++++
xen/arch/arm/lib/memset.S | 129 ++++++++++++++++++
xen/arch/arm/lib/memzero.S | 127 ++++++++++++++++++
xen/arch/arm/lib/setbit.S | 18 +++
xen/arch/arm/lib/testchangebit.S | 18 +++
xen/arch/arm/lib/testclearbit.S | 18 +++
xen/arch/arm/lib/testsetbit.S | 18 +++
17 files changed, 1523 insertions(+), 0 deletions(-)
create mode 100644 xen/arch/arm/lib/Makefile
create mode 100644 xen/arch/arm/lib/assembler.h
create mode 100644 xen/arch/arm/lib/bitops.h
create mode 100644 xen/arch/arm/lib/changebit.S
create mode 100644 xen/arch/arm/lib/clearbit.S
create mode 100644 xen/arch/arm/lib/copy_template.S
create mode 100644 xen/arch/arm/lib/div64.S
create mode 100644 xen/arch/arm/lib/findbit.S
create mode 100644 xen/arch/arm/lib/lib1funcs.S
create mode 100644 xen/arch/arm/lib/memcpy.S
create mode 100644 xen/arch/arm/lib/memmove.S
create mode 100644 xen/arch/arm/lib/memset.S
create mode 100644 xen/arch/arm/lib/memzero.S
create mode 100644 xen/arch/arm/lib/setbit.S
create mode 100644 xen/arch/arm/lib/testchangebit.S
create mode 100644 xen/arch/arm/lib/testclearbit.S
create mode 100644 xen/arch/arm/lib/testsetbit.S

diff --git a/xen/arch/arm/lib/Makefile b/xen/arch/arm/lib/Makefile
new file mode 100644
index 0000000..cbbed68
--- /dev/null
+++ b/xen/arch/arm/lib/Makefile
@@ -0,0 +1,5 @@
+obj-y += memcpy.o memmove.o memset.o memzero.o
+obj-y += findbit.o setbit.o
+obj-y += setbit.o clearbit.o changebit.o
+obj-y += testsetbit.o testclearbit.o testchangebit.o
+obj-y += lib1funcs.o div64.o
diff --git a/xen/arch/arm/lib/assembler.h b/xen/arch/arm/lib/assembler.h
new file mode 100644
index 0000000..f8f0961
--- /dev/null
+++ b/xen/arch/arm/lib/assembler.h
@@ -0,0 +1,49 @@
+#ifndef __ARCH_ARM_LIB_ASSEMBLER_H__
+#define __ARCH_ARM_LIB_ASSEMBLER_H__
+
+/* From Linux arch/arm/include/asm/assembler.h */
+/*
+ * Data preload for architectures that support it
+ */
+#define PLD(code...) code
+
+/*
+ * This can be used to enable code to cacheline align the destination
+ * pointer when bulk writing to memory. Experiments on StrongARM and
+ * XScale didn't show this a worthwhile thing to do when the cache is not
+ * set to write-allocate (this would need further testing on XScale when WA
+ * is used).
+ *
+ * On Feroceon there is much to gain however, regardless of cache mode.
+ */
+#ifdef CONFIG_CPU_FEROCEON /* Not in Xen... */
+#define CALGN(code...) code
+#else
+#define CALGN(code...)
+#endif
+
+// No Thumb, hence:
+#define W(instr) instr
+#define ARM(instr...) instr
+#define THUMB(instr...)
+
+#ifdef CONFIG_ARM_UNWIND
+#define UNWIND(code...) code
+#else
+#define UNWIND(code...)
+#endif
+
+#define pull lsl
+#define push lsr
+#define get_byte_0 lsr #24
+#define get_byte_1 lsr #16
+#define get_byte_2 lsr #8
+#define get_byte_3 lsl #0
+#define put_byte_0 lsl #24
+#define put_byte_1 lsl #16
+#define put_byte_2 lsl #8
+#define put_byte_3 lsl #0
+
+#define smp_dmb dmb
+
+#endif /* __ARCH_ARM_LIB_ASSEMBLER_H__ */
diff --git a/xen/arch/arm/lib/bitops.h b/xen/arch/arm/lib/bitops.h
new file mode 100644
index 0000000..e56d4e8
--- /dev/null
+++ b/xen/arch/arm/lib/bitops.h
@@ -0,0 +1,36 @@
+ .macro bitop, instr
+ ands ip, r1, #3
+ strneb r1, [ip] @ assert word-aligned
+ mov r2, #1
+ and r3, r0, #31 @ Get bit offset
+ mov r0, r0, lsr #5
+ add r1, r1, r0, lsl #2 @ Get word offset
+ mov r3, r2, lsl r3
+1: ldrex r2, [r1]
+ \instr r2, r2, r3
+ strex r0, r2, [r1]
+ cmp r0, #0
+ bne 1b
+ bx lr
+ .endm
+
+ .macro testop, instr, store
+ ands ip, r1, #3
+ strneb r1, [ip] @ assert word-aligned
+ mov r2, #1
+ and r3, r0, #31 @ Get bit offset
+ mov r0, r0, lsr #5
+ add r1, r1, r0, lsl #2 @ Get word offset
+ mov r3, r2, lsl r3 @ create mask
+ smp_dmb
+1: ldrex r2, [r1]
+ ands r0, r2, r3 @ save old value of bit
+ \instr r2, r2, r3 @ toggle bit
+ strex ip, r2, [r1]
+ cmp ip, #0
+ bne 1b
+ smp_dmb
+ cmp r0, #0
+ movne r0, #1
+2: bx lr
+ .endm
diff --git a/xen/arch/arm/lib/changebit.S b/xen/arch/arm/lib/changebit.S
new file mode 100644
index 0000000..62954bc
--- /dev/null
+++ b/xen/arch/arm/lib/changebit.S
@@ -0,0 +1,18 @@
+/*
+ * linux/arch/arm/lib/changebit.S
+ *
+ * Copyright (C) 1995-1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <xen/config.h>
+
+#include "assembler.h"
+#include "bitops.h"
+ .text
+
+ENTRY(_change_bit)
+ bitop eor
+ENDPROC(_change_bit)
diff --git a/xen/arch/arm/lib/clearbit.S b/xen/arch/arm/lib/clearbit.S
new file mode 100644
index 0000000..42ce416
--- /dev/null
+++ b/xen/arch/arm/lib/clearbit.S
@@ -0,0 +1,19 @@
+/*
+ * linux/arch/arm/lib/clearbit.S
+ *
+ * Copyright (C) 1995-1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <xen/config.h>
+
+#include "assembler.h"
+#include "bitops.h"
+ .text
+
+ENTRY(_clear_bit)
+ bitop bic
+ENDPROC(_clear_bit)
diff --git a/xen/arch/arm/lib/copy_template.S b/xen/arch/arm/lib/copy_template.S
new file mode 100644
index 0000000..7f7f4d5
--- /dev/null
+++ b/xen/arch/arm/lib/copy_template.S
@@ -0,0 +1,266 @@
+/*
+ * linux/arch/arm/lib/copy_template.s
+ *
+ * Code template for optimized memory copy functions
+ *
+ * Author: Nicolas Pitre
+ * Created: Sep 28, 2005
+ * Copyright: MontaVista Software, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * Theory of operation
+ * -------------------
+ *
+ * This file provides the core code for a forward memory copy used in
+ * the implementation of memcopy(), copy_to_user() and copy_from_user().
+ *
+ * The including file must define the following accessor macros
+ * according to the need of the given function:
+ *
+ * ldr1w ptr reg abort
+ *
+ * This loads one word from 'ptr', stores it in 'reg' and increments
+ * 'ptr' to the next word. The 'abort' argument is used for fixup tables.
+ *
+ * ldr4w ptr reg1 reg2 reg3 reg4 abort
+ * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+ *
+ * This loads four or eight words starting from 'ptr', stores them
+ * in provided registers and increments 'ptr' past those words.
+ * The'abort' argument is used for fixup tables.
+ *
+ * ldr1b ptr reg cond abort
+ *
+ * Similar to ldr1w, but it loads a byte and increments 'ptr' one byte.
+ * It also must apply the condition code if provided, otherwise the
+ * "al" condition is assumed by default.
+ *
+ * str1w ptr reg abort
+ * str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+ * str1b ptr reg cond abort
+ *
+ * Same as their ldr* counterparts, but data is stored to 'ptr' location
+ * rather than being loaded.
+ *
+ * enter reg1 reg2
+ *
+ * Preserve the provided registers on the stack plus any additional
+ * data as needed by the implementation including this code. Called
+ * upon code entry.
+ *
+ * exit reg1 reg2
+ *
+ * Restore registers with the values previously saved with the
+ * 'preserv' macro. Called upon code termination.
+ *
+ * LDR1W_SHIFT
+ * STR1W_SHIFT
+ *
+ * Correction to be applied to the "ip" register when branching into
+ * the ldr1w or str1w instructions (some of these macros may expand to
+ * than one 32bit instruction in Thumb-2)
+ */
+
+ enter r4, lr
+
+ subs r2, r2, #4
+ blt 8f
+ ands ip, r0, #3
+ PLD( pld [r1, #0] )
+ bne 9f
+ ands ip, r1, #3
+ bne 10f
+
+1: subs r2, r2, #(28)
+ stmfd sp!, {r5 - r8}
+ blt 5f
+
+ CALGN( ands ip, r0, #31 )
+ CALGN( rsb r3, ip, #32 )
+ CALGN( sbcnes r4, r3, r2 ) @ C is always set here
+ CALGN( bcs 2f )
+ CALGN( adr r4, 6f )
+ CALGN( subs r2, r2, r3 ) @ C gets set
+ CALGN( add pc, r4, ip )
+
+ PLD( pld [r1, #0] )
+2: PLD( subs r2, r2, #96 )
+ PLD( pld [r1, #28] )
+ PLD( blt 4f )
+ PLD( pld [r1, #60] )
+ PLD( pld [r1, #92] )
+
+3: PLD( pld [r1, #124] )
+4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+ subs r2, r2, #32
+ str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+ bge 3b
+ PLD( cmn r2, #96 )
+ PLD( bge 4b )
+
+5: ands ip, r2, #28
+ rsb ip, ip, #32
+#if LDR1W_SHIFT > 0
+ lsl ip, ip, #LDR1W_SHIFT
+#endif
+ addne pc, pc, ip @ C is always clear here
+ b 7f
+6:
+ .rept (1 << LDR1W_SHIFT)
+ W(nop)
+ .endr
+ ldr1w r1, r3, abort=20f
+ ldr1w r1, r4, abort=20f
+ ldr1w r1, r5, abort=20f
+ ldr1w r1, r6, abort=20f
+ ldr1w r1, r7, abort=20f
+ ldr1w r1, r8, abort=20f
+ ldr1w r1, lr, abort=20f
+
+#if LDR1W_SHIFT < STR1W_SHIFT
+ lsl ip, ip, #STR1W_SHIFT - LDR1W_SHIFT
+#elif LDR1W_SHIFT > STR1W_SHIFT
+ lsr ip, ip, #LDR1W_SHIFT - STR1W_SHIFT
+#endif
+ add pc, pc, ip
+ nop
+ .rept (1 << STR1W_SHIFT)
+ W(nop)
+ .endr
+ str1w r0, r3, abort=20f
+ str1w r0, r4, abort=20f
+ str1w r0, r5, abort=20f
+ str1w r0, r6, abort=20f
+ str1w r0, r7, abort=20f
+ str1w r0, r8, abort=20f
+ str1w r0, lr, abort=20f
+
+ CALGN( bcs 2b )
+
+7: ldmfd sp!, {r5 - r8}
+
+8: movs r2, r2, lsl #31
+ ldr1b r1, r3, ne, abort=21f
+ ldr1b r1, r4, cs, abort=21f
+ ldr1b r1, ip, cs, abort=21f
+ str1b r0, r3, ne, abort=21f
+ str1b r0, r4, cs, abort=21f
+ str1b r0, ip, cs, abort=21f
+
+ exit r4, pc
+
+9: rsb ip, ip, #4
+ cmp ip, #2
+ ldr1b r1, r3, gt, abort=21f
+ ldr1b r1, r4, ge, abort=21f
+ ldr1b r1, lr, abort=21f
+ str1b r0, r3, gt, abort=21f
+ str1b r0, r4, ge, abort=21f
+ subs r2, r2, ip
+ str1b r0, lr, abort=21f
+ blt 8b
+ ands ip, r1, #3
+ beq 1b
+
+10: bic r1, r1, #3
+ cmp ip, #2
+ ldr1w r1, lr, abort=21f
+ beq 17f
+ bgt 18f
+
+
+ .macro forward_copy_shift pull push
+
+ subs r2, r2, #28
+ blt 14f
+
+ CALGN( ands ip, r0, #31 )
+ CALGN( rsb ip, ip, #32 )
+ CALGN( sbcnes r4, ip, r2 ) @ C is always set here
+ CALGN( subcc r2, r2, ip )
+ CALGN( bcc 15f )
+
+11: stmfd sp!, {r5 - r9}
+
+ PLD( pld [r1, #0] )
+ PLD( subs r2, r2, #96 )
+ PLD( pld [r1, #28] )
+ PLD( blt 13f )
+ PLD( pld [r1, #60] )
+ PLD( pld [r1, #92] )
+
+12: PLD( pld [r1, #124] )
+13: ldr4w r1, r4, r5, r6, r7, abort=19f
+ mov r3, lr, pull #\pull
+ subs r2, r2, #32
+ ldr4w r1, r8, r9, ip, lr, abort=19f
+ orr r3, r3, r4, push #\push
+ mov r4, r4, pull #\pull
+ orr r4, r4, r5, push #\push
+ mov r5, r5, pull #\pull
+ orr r5, r5, r6, push #\push
+ mov r6, r6, pull #\pull
+ orr r6, r6, r7, push #\push
+ mov r7, r7, pull #\pull
+ orr r7, r7, r8, push #\push
+ mov r8, r8, pull #\pull
+ orr r8, r8, r9, push #\push
+ mov r9, r9, pull #\pull
+ orr r9, r9, ip, push #\push
+ mov ip, ip, pull #\pull
+ orr ip, ip, lr, push #\push
+ str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
+ bge 12b
+ PLD( cmn r2, #96 )
+ PLD( bge 13b )
+
+ ldmfd sp!, {r5 - r9}
+
+14: ands ip, r2, #28
+ beq 16f
+
+15: mov r3, lr, pull #\pull
+ ldr1w r1, lr, abort=21f
+ subs ip, ip, #4
+ orr r3, r3, lr, push #\push
+ str1w r0, r3, abort=21f
+ bgt 15b
+ CALGN( cmp r2, #0 )
+ CALGN( bge 11b )
+
+16: sub r1, r1, #(\push / 8)
+ b 8b
+
+ .endm
+
+
+ forward_copy_shift pull=8 push=24
+
+17: forward_copy_shift pull=16 push=16
+
+18: forward_copy_shift pull=24 push=8
+
+
+/*
+ * Abort preamble and completion macros.
+ * If a fixup handler is required then those macros must surround it.
+ * It is assumed that the fixup code will handle the private part of
+ * the exit macro.
+ */
+
+ .macro copy_abort_preamble
+19: ldmfd sp!, {r5 - r9}
+ b 21f
+20: ldmfd sp!, {r5 - r8}
+21:
+ .endm
+
+ .macro copy_abort_end
+ ldmfd sp!, {r4, pc}
+ .endm
+
diff --git a/xen/arch/arm/lib/div64.S b/xen/arch/arm/lib/div64.S
new file mode 100644
index 0000000..2584772
--- /dev/null
+++ b/xen/arch/arm/lib/div64.S
@@ -0,0 +1,149 @@
+/*
+ * linux/arch/arm/lib/div64.S
+ *
+ * Optimized computation of 64-bit dividend / 32-bit divisor
+ *
+ * Author: Nicolas Pitre
+ * Created: Oct 5, 2003
+ * Copyright: Monta Vista Software, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <xen/config.h>
+#include "assembler.h"
+
+#define xl r0
+#define xh r1
+#define yl r2
+#define yh r3
+
+/*
+ * __do_div64: perform a division with 64-bit dividend and 32-bit divisor.
+ *
+ * Note: Calling convention is totally non standard for optimal code.
+ * This is meant to be used by do_div() from include/asm/div64.h only.
+ *
+ * Input parameters:
+ * xh-xl = dividend (clobbered)
+ * r4 = divisor (preserved)
+ *
+ * Output values:
+ * yh-yl = result
+ * xh = remainder
+ *
+ * Clobbered regs: xl, ip
+ */
+
+ENTRY(__do_div64)
+
+ @ Test for easy paths first.
+ subs ip, r4, #1
+ bls 9f @ divisor is 0 or 1
+ tst ip, r4
+ beq 8f @ divisor is power of 2
+
+ @ See if we need to handle upper 32-bit result.
+ cmp xh, r4
+ mov yh, #0
+ blo 3f
+
+ @ Align divisor with upper part of dividend.
+ @ The aligned divisor is stored in yl preserving the original.
+ @ The bit position is stored in ip.
+
+ clz yl, r4
+ clz ip, xh
+ sub yl, yl, ip
+ mov ip, #1
+ mov ip, ip, lsl yl
+ mov yl, r4, lsl yl
+
+ @ The division loop for needed upper bit positions.
+ @ Break out early if dividend reaches 0.
+2: cmp xh, yl
+ orrcs yh, yh, ip
+ subcss xh, xh, yl
+ movnes ip, ip, lsr #1
+ mov yl, yl, lsr #1
+ bne 2b
+
+ @ See if we need to handle lower 32-bit result.
+3: cmp xh, #0
+ mov yl, #0
+ cmpeq xl, r4
+ movlo xh, xl
+ movlo pc, lr
+
+ @ The division loop for lower bit positions.
+ @ Here we shift remainer bits leftwards rather than moving the
+ @ divisor for comparisons, considering the carry-out bit as well.
+ mov ip, #0x80000000
+4: movs xl, xl, lsl #1
+ adcs xh, xh, xh
+ beq 6f
+ cmpcc xh, r4
+5: orrcs yl, yl, ip
+ subcs xh, xh, r4
+ movs ip, ip, lsr #1
+ bne 4b
+ mov pc, lr
+
+ @ The top part of remainder became zero. If carry is set
+ @ (the 33th bit) this is a false positive so resume the loop.
+ @ Otherwise, if lower part is also null then we are done.
+6: bcs 5b
+ cmp xl, #0
+ moveq pc, lr
+
+ @ We still have remainer bits in the low part. Bring them up.
+
+ clz xh, xl @ we know xh is zero here so...
+ add xh, xh, #1
+ mov xl, xl, lsl xh
+ mov ip, ip, lsr xh
+
+ @ Current remainder is now 1. It is worthless to compare with
+ @ divisor at this point since divisor can not be smaller than 3 here.
+ @ If possible, branch for another shift in the division loop.
+ @ If no bit position left then we are done.
+ movs ip, ip, lsr #1
+ mov xh, #1
+ bne 4b
+ mov pc, lr
+
+8: @ Division by a power of 2: determine what that divisor order is
+ @ then simply shift values around
+
+ clz ip, r4
+ rsb ip, ip, #31
+
+ mov yh, xh, lsr ip
+ mov yl, xl, lsr ip
+ rsb ip, ip, #32
+ ARM( orr yl, yl, xh, lsl ip )
+ THUMB( lsl xh, xh, ip )
+ THUMB( orr yl, yl, xh )
+ mov xh, xl, lsl ip
+ mov xh, xh, lsr ip
+ mov pc, lr
+
+ @ eq -> division by 1: obvious enough...
+9: moveq yl, xl
+ moveq yh, xh
+ moveq xh, #0
+ moveq pc, lr
+
+ @ Division by 0:
+ str lr, [sp, #-8]!
+ bl __div0
+
+ @ as wrong as it could be...
+ mov yl, #0
+ mov yh, #0
+ mov xh, #0
+ ldr pc, [sp], #8
+
+ENDPROC(__do_div64)
diff --git a/xen/arch/arm/lib/findbit.S b/xen/arch/arm/lib/findbit.S
new file mode 100644
index 0000000..5669b91
--- /dev/null
+++ b/xen/arch/arm/lib/findbit.S
@@ -0,0 +1,115 @@
+/*
+ * linux/arch/arm/lib/findbit.S
+ *
+ * Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 16th March 2001 - John Ripley <jripley@sonicblue.com>
+ * Fixed so that "size" is an exclusive not an inclusive quantity.
+ * All users of these functions expect exclusive sizes, and may
+ * also call with zero size.
+ * Reworked by rmk.
+ */
+
+#include <xen/config.h>
+
+#include "assembler.h"
+ .text
+
+/*
+ * Purpose : Find a 'zero' bit
+ * Prototype: int find_first_zero_bit(void *addr, unsigned int maxbit);
+ */
+ENTRY(_find_first_zero_bit)
+ teq r1, #0
+ beq 3f
+ mov r2, #0
+1:
+ ARM( ldrb r3, [r0, r2, lsr #3] )
+ THUMB( lsr r3, r2, #3 )
+ THUMB( ldrb r3, [r0, r3] )
+ eors r3, r3, #0xff @ invert bits
+ bne .L_found @ any now set - found zero bit
+ add r2, r2, #8 @ next bit pointer
+2: cmp r2, r1 @ any more?
+ blo 1b
+3: mov r0, r1 @ no free bits
+ mov pc, lr
+ENDPROC(_find_first_zero_bit)
+
+/*
+ * Purpose : Find next 'zero' bit
+ * Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset)
+ */
+ENTRY(_find_next_zero_bit)
+ teq r1, #0
+ beq 3b
+ ands ip, r2, #7
+ beq 1b @ If new byte, goto old routine
+ ARM( ldrb r3, [r0, r2, lsr #3] )
+ THUMB( lsr r3, r2, #3 )
+ THUMB( ldrb r3, [r0, r3] )
+ eor r3, r3, #0xff @ now looking for a 1 bit
+ movs r3, r3, lsr ip @ shift off unused bits
+ bne .L_found
+ orr r2, r2, #7 @ if zero, then no bits here
+ add r2, r2, #1 @ align bit pointer
+ b 2b @ loop for next bit
+ENDPROC(_find_next_zero_bit)
+
+/*
+ * Purpose : Find a 'one' bit
+ * Prototype: int find_first_bit(const unsigned long *addr, unsigned int maxbit);
+ */
+ENTRY(_find_first_bit)
+ teq r1, #0
+ beq 3f
+ mov r2, #0
+1:
+ ARM( ldrb r3, [r0, r2, lsr #3] )
+ THUMB( lsr r3, r2, #3 )
+ THUMB( ldrb r3, [r0, r3] )
+ movs r3, r3
+ bne .L_found @ any now set - found zero bit
+ add r2, r2, #8 @ next bit pointer
+2: cmp r2, r1 @ any more?
+ blo 1b
+3: mov r0, r1 @ no free bits
+ mov pc, lr
+ENDPROC(_find_first_bit)
+
+/*
+ * Purpose : Find next 'one' bit
+ * Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset)
+ */
+ENTRY(_find_next_bit)
+ teq r1, #0
+ beq 3b
+ ands ip, r2, #7
+ beq 1b @ If new byte, goto old routine
+ ARM( ldrb r3, [r0, r2, lsr #3] )
+ THUMB( lsr r3, r2, #3 )
+ THUMB( ldrb r3, [r0, r3] )
+ movs r3, r3, lsr ip @ shift off unused bits
+ bne .L_found
+ orr r2, r2, #7 @ if zero, then no bits here
+ add r2, r2, #1 @ align bit pointer
+ b 2b @ loop for next bit
+ENDPROC(_find_next_bit)
+
+/*
+ * One or more bits in the LSB of r3 are assumed to be set.
+ */
+.L_found:
+ rsb r0, r3, #0
+ and r3, r3, r0
+ clz r3, r3
+ rsb r3, r3, #31
+ add r0, r2, r3
+ cmp r1, r0 @ Clamp to maxbit
+ movlo r0, r1
+ mov pc, lr
+
diff --git a/xen/arch/arm/lib/lib1funcs.S b/xen/arch/arm/lib/lib1funcs.S
new file mode 100644
index 0000000..745f4af
--- /dev/null
+++ b/xen/arch/arm/lib/lib1funcs.S
@@ -0,0 +1,274 @@
+/*
+ * linux/arch/arm/lib/lib1funcs.S: Optimized ARM division routines
+ *
+ * Author: Nicolas Pitre <nico@fluxnic.net>
+ * - contributed to gcc-3.4 on Sep 30, 2003
+ * - adapted for the Linux kernel on Oct 2, 2003
+ */
+
+/* Copyright 1995, 1996, 1998, 1999, 2000, 2003 Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file. (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING. If not, write to
+the Free Software Foundation, 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA. */
+
+
+#include <xen/config.h>
+#include "assembler.h"
+
+.macro ARM_DIV_BODY dividend, divisor, result, curbit
+
+ clz \curbit, \divisor
+ clz \result, \dividend
+ sub \result, \curbit, \result
+ mov \curbit, #1
+ mov \divisor, \divisor, lsl \result
+ mov \curbit, \curbit, lsl \result
+ mov \result, #0
+
+ @ Division loop
+1: cmp \dividend, \divisor
+ subhs \dividend, \dividend, \divisor
+ orrhs \result, \result, \curbit
+ cmp \dividend, \divisor, lsr #1
+ subhs \dividend, \dividend, \divisor, lsr #1
+ orrhs \result, \result, \curbit, lsr #1
+ cmp \dividend, \divisor, lsr #2
+ subhs \dividend, \dividend, \divisor, lsr #2
+ orrhs \result, \result, \curbit, lsr #2
+ cmp \dividend, \divisor, lsr #3
+ subhs \dividend, \dividend, \divisor, lsr #3
+ orrhs \result, \result, \curbit, lsr #3
+ cmp \dividend, #0 @ Early termination?
+ movnes \curbit, \curbit, lsr #4 @ No, any more bits to do?
+ movne \divisor, \divisor, lsr #4
+ bne 1b
+
+.endm
+
+
+.macro ARM_DIV2_ORDER divisor, order
+
+ clz \order, \divisor
+ rsb \order, \order, #31
+
+.endm
+
+
+.macro ARM_MOD_BODY dividend, divisor, order, spare
+
+ clz \order, \divisor
+ clz \spare, \dividend
+ sub \order, \order, \spare
+ mov \divisor, \divisor, lsl \order
+
+ @ Perform all needed substractions to keep only the reminder.
+ @ Do comparisons in batch of 4 first.
+ subs \order, \order, #3 @ yes, 3 is intended here
+ blt 2f
+
+1: cmp \dividend, \divisor
+ subhs \dividend, \dividend, \divisor
+ cmp \dividend, \divisor, lsr #1
+ subhs \dividend, \dividend, \divisor, lsr #1
+ cmp \dividend, \divisor, lsr #2
+ subhs \dividend, \dividend, \divisor, lsr #2
+ cmp \dividend, \divisor, lsr #3
+ subhs \dividend, \dividend, \divisor, lsr #3
+ cmp \dividend, #1
+ mov \divisor, \divisor, lsr #4
+ subges \order, \order, #4
+ bge 1b
+
+ tst \order, #3
+ teqne \dividend, #0
+ beq 5f
+
+ @ Either 1, 2 or 3 comparison/substractions are left.
+2: cmn \order, #2
+ blt 4f
+ beq 3f
+ cmp \dividend, \divisor
+ subhs \dividend, \dividend, \divisor
+ mov \divisor, \divisor, lsr #1
+3: cmp \dividend, \divisor
+ subhs \dividend, \dividend, \divisor
+ mov \divisor, \divisor, lsr #1
+4: cmp \dividend, \divisor
+ subhs \dividend, \dividend, \divisor
+5:
+.endm
+
+
+ENTRY(__udivsi3)
+ENTRY(__aeabi_uidiv)
+UNWIND(.fnstart)
+
+ subs r2, r1, #1
+ moveq pc, lr
+ bcc Ldiv0
+ cmp r0, r1
+ bls 11f
+ tst r1, r2
+ beq 12f
+
+ ARM_DIV_BODY r0, r1, r2, r3
+
+ mov r0, r2
+ mov pc, lr
+
+11: moveq r0, #1
+ movne r0, #0
+ mov pc, lr
+
+12: ARM_DIV2_ORDER r1, r2
+
+ mov r0, r0, lsr r2
+ mov pc, lr
+
+UNWIND(.fnend)
+ENDPROC(__udivsi3)
+ENDPROC(__aeabi_uidiv)
+
+ENTRY(__umodsi3)
+UNWIND(.fnstart)
+
+ subs r2, r1, #1 @ compare divisor with 1
+ bcc Ldiv0
+ cmpne r0, r1 @ compare dividend with divisor
+ moveq r0, #0
+ tsthi r1, r2 @ see if divisor is power of 2
+ andeq r0, r0, r2
+ movls pc, lr
+
+ ARM_MOD_BODY r0, r1, r2, r3
+
+ mov pc, lr
+
+UNWIND(.fnend)
+ENDPROC(__umodsi3)
+
+ENTRY(__divsi3)
+ENTRY(__aeabi_idiv)
+UNWIND(.fnstart)
+
+ cmp r1, #0
+ eor ip, r0, r1 @ save the sign of the result.
+ beq Ldiv0
+ rsbmi r1, r1, #0 @ loops below use unsigned.
+ subs r2, r1, #1 @ division by 1 or -1 ?
+ beq 10f
+ movs r3, r0
+ rsbmi r3, r0, #0 @ positive dividend value
+ cmp r3, r1
+ bls 11f
+ tst r1, r2 @ divisor is power of 2 ?
+ beq 12f
+
+ ARM_DIV_BODY r3, r1, r0, r2
+
+ cmp ip, #0
+ rsbmi r0, r0, #0
+ mov pc, lr
+
+10: teq ip, r0 @ same sign ?
+ rsbmi r0, r0, #0
+ mov pc, lr
+
+11: movlo r0, #0
+ moveq r0, ip, asr #31
+ orreq r0, r0, #1
+ mov pc, lr
+
+12: ARM_DIV2_ORDER r1, r2
+
+ cmp ip, #0
+ mov r0, r3, lsr r2
+ rsbmi r0, r0, #0
+ mov pc, lr
+
+UNWIND(.fnend)
+ENDPROC(__divsi3)
+ENDPROC(__aeabi_idiv)
+
+ENTRY(__modsi3)
+UNWIND(.fnstart)
+
+ cmp r1, #0
+ beq Ldiv0
+ rsbmi r1, r1, #0 @ loops below use unsigned.
+ movs ip, r0 @ preserve sign of dividend
+ rsbmi r0, r0, #0 @ if negative make positive
+ subs r2, r1, #1 @ compare divisor with 1
+ cmpne r0, r1 @ compare dividend with divisor
+ moveq r0, #0
+ tsthi r1, r2 @ see if divisor is power of 2
+ andeq r0, r0, r2
+ bls 10f
+
+ ARM_MOD_BODY r0, r1, r2, r3
+
+10: cmp ip, #0
+ rsbmi r0, r0, #0
+ mov pc, lr
+
+UNWIND(.fnend)
+ENDPROC(__modsi3)
+
+ENTRY(__aeabi_uidivmod)
+UNWIND(.fnstart)
+UNWIND(.save {r0, r1, ip, lr} )
+
+ stmfd sp!, {r0, r1, ip, lr}
+ bl __aeabi_uidiv
+ ldmfd sp!, {r1, r2, ip, lr}
+ mul r3, r0, r2
+ sub r1, r1, r3
+ mov pc, lr
+
+UNWIND(.fnend)
+ENDPROC(__aeabi_uidivmod)
+
+ENTRY(__aeabi_idivmod)
+UNWIND(.fnstart)
+UNWIND(.save {r0, r1, ip, lr} )
+ stmfd sp!, {r0, r1, ip, lr}
+ bl __aeabi_idiv
+ ldmfd sp!, {r1, r2, ip, lr}
+ mul r3, r0, r2
+ sub r1, r1, r3
+ mov pc, lr
+
+UNWIND(.fnend)
+ENDPROC(__aeabi_idivmod)
+
+Ldiv0:
+UNWIND(.fnstart)
+UNWIND(.pad #4)
+UNWIND(.save {lr})
+ str lr, [sp, #-8]!
+ bl __div0
+ mov r0, #0 @ About as wrong as it could be.
+ ldr pc, [sp], #8
+UNWIND(.fnend)
+ENDPROC(Ldiv0)
diff --git a/xen/arch/arm/lib/memcpy.S b/xen/arch/arm/lib/memcpy.S
new file mode 100644
index 0000000..f4bad5c
--- /dev/null
+++ b/xen/arch/arm/lib/memcpy.S
@@ -0,0 +1,64 @@
+/*
+ * linux/arch/arm/lib/memcpy.S
+ *
+ * Author: Nicolas Pitre
+ * Created: Sep 28, 2005
+ * Copyright: MontaVista Software, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <xen/config.h>
+
+#include "assembler.h"
+
+#define LDR1W_SHIFT 0
+#define STR1W_SHIFT 0
+
+ .macro ldr1w ptr reg abort
+ W(ldr) \reg, [\ptr], #4
+ .endm
+
+ .macro ldr4w ptr reg1 reg2 reg3 reg4 abort
+ ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
+ .endm
+
+ .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+ ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+ .endm
+
+ .macro ldr1b ptr reg cond=al abort
+ ldr\cond\()b \reg, [\ptr], #1
+ .endm
+
+ .macro str1w ptr reg abort
+ W(str) \reg, [\ptr], #4
+ .endm
+
+ .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+ stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+ .endm
+
+ .macro str1b ptr reg cond=al abort
+ str\cond\()b \reg, [\ptr], #1
+ .endm
+
+ .macro enter reg1 reg2
+ stmdb sp!, {r0, \reg1, \reg2}
+ .endm
+
+ .macro exit reg1 reg2
+ ldmfd sp!, {r0, \reg1, \reg2}
+ .endm
+
+ .text
+
+/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
+
+ENTRY(memcpy)
+
+#include "copy_template.S"
+
+ENDPROC(memcpy)
diff --git a/xen/arch/arm/lib/memmove.S b/xen/arch/arm/lib/memmove.S
new file mode 100644
index 0000000..4e142b8
--- /dev/null
+++ b/xen/arch/arm/lib/memmove.S
@@ -0,0 +1,200 @@
+/*
+ * linux/arch/arm/lib/memmove.S
+ *
+ * Author: Nicolas Pitre
+ * Created: Sep 28, 2005
+ * Copyright: (C) MontaVista Software Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <xen/config.h>
+
+#include "assembler.h"
+
+ .text
+
+/*
+ * Prototype: void *memmove(void *dest, const void *src, size_t n);
+ *
+ * Note:
+ *
+ * If the memory regions don't overlap, we simply branch to memcpy which is
+ * normally a bit faster. Otherwise the copy is done going downwards. This
+ * is a transposition of the code from copy_template.S but with the copy
+ * occurring in the opposite direction.
+ */
+
+ENTRY(memmove)
+
+ subs ip, r0, r1
+ cmphi r2, ip
+ bls memcpy
+
+ stmfd sp!, {r0, r4, lr}
+ add r1, r1, r2
+ add r0, r0, r2
+ subs r2, r2, #4
+ blt 8f
+ ands ip, r0, #3
+ PLD( pld [r1, #-4] )
+ bne 9f
+ ands ip, r1, #3
+ bne 10f
+
+1: subs r2, r2, #(28)
+ stmfd sp!, {r5 - r8}
+ blt 5f
+
+ CALGN( ands ip, r0, #31 )
+ CALGN( sbcnes r4, ip, r2 ) @ C is always set here
+ CALGN( bcs 2f )
+ CALGN( adr r4, 6f )
+ CALGN( subs r2, r2, ip ) @ C is set here
+ CALGN( rsb ip, ip, #32 )
+ CALGN( add pc, r4, ip )
+
+ PLD( pld [r1, #-4] )
+2: PLD( subs r2, r2, #96 )
+ PLD( pld [r1, #-32] )
+ PLD( blt 4f )
+ PLD( pld [r1, #-64] )
+ PLD( pld [r1, #-96] )
+
+3: PLD( pld [r1, #-128] )
+4: ldmdb r1!, {r3, r4, r5, r6, r7, r8, ip, lr}
+ subs r2, r2, #32
+ stmdb r0!, {r3, r4, r5, r6, r7, r8, ip, lr}
+ bge 3b
+ PLD( cmn r2, #96 )
+ PLD( bge 4b )
+
+5: ands ip, r2, #28
+ rsb ip, ip, #32
+ addne pc, pc, ip @ C is always clear here
+ b 7f
+6: W(nop)
+ W(ldr) r3, [r1, #-4]!
+ W(ldr) r4, [r1, #-4]!
+ W(ldr) r5, [r1, #-4]!
+ W(ldr) r6, [r1, #-4]!
+ W(ldr) r7, [r1, #-4]!
+ W(ldr) r8, [r1, #-4]!
+ W(ldr) lr, [r1, #-4]!
+
+ add pc, pc, ip
+ nop
+ W(nop)
+ W(str) r3, [r0, #-4]!
+ W(str) r4, [r0, #-4]!
+ W(str) r5, [r0, #-4]!
+ W(str) r6, [r0, #-4]!
+ W(str) r7, [r0, #-4]!
+ W(str) r8, [r0, #-4]!
+ W(str) lr, [r0, #-4]!
+
+ CALGN( bcs 2b )
+
+7: ldmfd sp!, {r5 - r8}
+
+8: movs r2, r2, lsl #31
+ ldrneb r3, [r1, #-1]!
+ ldrcsb r4, [r1, #-1]!
+ ldrcsb ip, [r1, #-1]
+ strneb r3, [r0, #-1]!
+ strcsb r4, [r0, #-1]!
+ strcsb ip, [r0, #-1]
+ ldmfd sp!, {r0, r4, pc}
+
+9: cmp ip, #2
+ ldrgtb r3, [r1, #-1]!
+ ldrgeb r4, [r1, #-1]!
+ ldrb lr, [r1, #-1]!
+ strgtb r3, [r0, #-1]!
+ strgeb r4, [r0, #-1]!
+ subs r2, r2, ip
+ strb lr, [r0, #-1]!
+ blt 8b
+ ands ip, r1, #3
+ beq 1b
+
+10: bic r1, r1, #3
+ cmp ip, #2
+ ldr r3, [r1, #0]
+ beq 17f
+ blt 18f
+
+
+ .macro backward_copy_shift push pull
+
+ subs r2, r2, #28
+ blt 14f
+
+ CALGN( ands ip, r0, #31 )
+ CALGN( sbcnes r4, ip, r2 ) @ C is always set here
+ CALGN( subcc r2, r2, ip )
+ CALGN( bcc 15f )
+
+11: stmfd sp!, {r5 - r9}
+
+ PLD( pld [r1, #-4] )
+ PLD( subs r2, r2, #96 )
+ PLD( pld [r1, #-32] )
+ PLD( blt 13f )
+ PLD( pld [r1, #-64] )
+ PLD( pld [r1, #-96] )
+
+12: PLD( pld [r1, #-128] )
+13: ldmdb r1!, {r7, r8, r9, ip}
+ mov lr, r3, push #\push
+ subs r2, r2, #32
+ ldmdb r1!, {r3, r4, r5, r6}
+ orr lr, lr, ip, pull #\pull
+ mov ip, ip, push #\push
+ orr ip, ip, r9, pull #\pull
+ mov r9, r9, push #\push
+ orr r9, r9, r8, pull #\pull
+ mov r8, r8, push #\push
+ orr r8, r8, r7, pull #\pull
+ mov r7, r7, push #\push
+ orr r7, r7, r6, pull #\pull
+ mov r6, r6, push #\push
+ orr r6, r6, r5, pull #\pull
+ mov r5, r5, push #\push
+ orr r5, r5, r4, pull #\pull
+ mov r4, r4, push #\push
+ orr r4, r4, r3, pull #\pull
+ stmdb r0!, {r4 - r9, ip, lr}
+ bge 12b
+ PLD( cmn r2, #96 )
+ PLD( bge 13b )
+
+ ldmfd sp!, {r5 - r9}
+
+14: ands ip, r2, #28
+ beq 16f
+
+15: mov lr, r3, push #\push
+ ldr r3, [r1, #-4]!
+ subs ip, ip, #4
+ orr lr, lr, r3, pull #\pull
+ str lr, [r0, #-4]!
+ bgt 15b
+ CALGN( cmp r2, #0 )
+ CALGN( bge 11b )
+
+16: add r1, r1, #(\pull / 8)
+ b 8b
+
+ .endm
+
+
+ backward_copy_shift push=8 pull=24
+
+17: backward_copy_shift push=16 pull=16
+
+18: backward_copy_shift push=24 pull=8
+
+ENDPROC(memmove)
diff --git a/xen/arch/arm/lib/memset.S b/xen/arch/arm/lib/memset.S
new file mode 100644
index 0000000..d2937a3
--- /dev/null
+++ b/xen/arch/arm/lib/memset.S
@@ -0,0 +1,129 @@
+/*
+ * linux/arch/arm/lib/memset.S
+ *
+ * Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * ASM optimised string functions
+ */
+
+#include <xen/config.h>
+
+#include "assembler.h"
+
+ .text
+ .align 5
+ .word 0
+
+1: subs r2, r2, #4 @ 1 do we have enough
+ blt 5f @ 1 bytes to align with?
+ cmp r3, #2 @ 1
+ strltb r1, [r0], #1 @ 1
+ strleb r1, [r0], #1 @ 1
+ strb r1, [r0], #1 @ 1
+ add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3))
+/*
+ * The pointer is now aligned and the length is adjusted. Try doing the
+ * memset again.
+ */
+
+ENTRY(memset)
+ ands r3, r0, #3 @ 1 unaligned?
+ bne 1b @ 1
+/*
+ * we know that the pointer in r0 is aligned to a word boundary.
+ */
+ orr r1, r1, r1, lsl #8
+ orr r1, r1, r1, lsl #16
+ mov r3, r1
+ cmp r2, #16
+ blt 4f
+
+#if ! CALGN(1)+0
+
+/*
+ * We need an extra register for this loop - save the return address and
+ * use the LR
+ */
+ str lr, [sp, #-4]!
+ mov ip, r1
+ mov lr, r1
+
+2: subs r2, r2, #64
+ stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time.
+ stmgeia r0!, {r1, r3, ip, lr}
+ stmgeia r0!, {r1, r3, ip, lr}
+ stmgeia r0!, {r1, r3, ip, lr}
+ bgt 2b
+ ldmeqfd sp!, {pc} @ Now <64 bytes to go.
+/*
+ * No need to correct the count; we're only testing bits from now on
+ */
+ tst r2, #32
+ stmneia r0!, {r1, r3, ip, lr}
+ stmneia r0!, {r1, r3, ip, lr}
+ tst r2, #16
+ stmneia r0!, {r1, r3, ip, lr}
+ ldr lr, [sp], #4
+
+#else
+
+/*
+ * This version aligns the destination pointer in order to write
+ * whole cache lines at once.
+ */
+
+ stmfd sp!, {r4-r7, lr}
+ mov r4, r1
+ mov r5, r1
+ mov r6, r1
+ mov r7, r1
+ mov ip, r1
+ mov lr, r1
+
+ cmp r2, #96
+ tstgt r0, #31
+ ble 3f
+
+ and ip, r0, #31
+ rsb ip, ip, #32
+ sub r2, r2, ip
+ movs ip, ip, lsl #(32 - 4)
+ stmcsia r0!, {r4, r5, r6, r7}
+ stmmiia r0!, {r4, r5}
+ tst ip, #(1 << 30)
+ mov ip, r1
+ strne r1, [r0], #4
+
+3: subs r2, r2, #64
+ stmgeia r0!, {r1, r3-r7, ip, lr}
+ stmgeia r0!, {r1, r3-r7, ip, lr}
+ bgt 3b
+ ldmeqfd sp!, {r4-r7, pc}
+
+ tst r2, #32
+ stmneia r0!, {r1, r3-r7, ip, lr}
+ tst r2, #16
+ stmneia r0!, {r4-r7}
+ ldmfd sp!, {r4-r7, lr}
+
+#endif
+
+4: tst r2, #8
+ stmneia r0!, {r1, r3}
+ tst r2, #4
+ strne r1, [r0], #4
+/*
+ * When we get here, we've got less than 4 bytes to zero. We
+ * may have an unaligned pointer as well.
+ */
+5: tst r2, #2
+ strneb r1, [r0], #1
+ strneb r1, [r0], #1
+ tst r2, #1
+ strneb r1, [r0], #1
+ mov pc, lr
+ENDPROC(memset)
diff --git a/xen/arch/arm/lib/memzero.S b/xen/arch/arm/lib/memzero.S
new file mode 100644
index 0000000..ce25aca
--- /dev/null
+++ b/xen/arch/arm/lib/memzero.S
@@ -0,0 +1,127 @@
+/*
+ * linux/arch/arm/lib/memzero.S
+ *
+ * Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <xen/config.h>
+
+#include "assembler.h"
+
+ .text
+ .align 5
+ .word 0
+/*
+ * Align the pointer in r0. r3 contains the number of bytes that we are
+ * mis-aligned by, and r1 is the number of bytes. If r1 < 4, then we
+ * don't bother; we use byte stores instead.
+ */
+1: subs r1, r1, #4 @ 1 do we have enough
+ blt 5f @ 1 bytes to align with?
+ cmp r3, #2 @ 1
+ strltb r2, [r0], #1 @ 1
+ strleb r2, [r0], #1 @ 1
+ strb r2, [r0], #1 @ 1
+ add r1, r1, r3 @ 1 (r1 = r1 - (4 - r3))
+/*
+ * The pointer is now aligned and the length is adjusted. Try doing the
+ * memzero again.
+ */
+
+ENTRY(__memzero)
+ mov r2, #0 @ 1
+ ands r3, r0, #3 @ 1 unaligned?
+ bne 1b @ 1
+/*
+ * r3 = 0, and we know that the pointer in r0 is aligned to a word boundary.
+ */
+ cmp r1, #16 @ 1 we can skip this chunk if we
+ blt 4f @ 1 have < 16 bytes
+
+#if ! CALGN(1)+0
+
+/*
+ * We need an extra register for this loop - save the return address and
+ * use the LR
+ */
+ str lr, [sp, #-4]! @ 1
+ mov ip, r2 @ 1
+ mov lr, r2 @ 1
+
+3: subs r1, r1, #64 @ 1 write 32 bytes out per loop
+ stmgeia r0!, {r2, r3, ip, lr} @ 4
+ stmgeia r0!, {r2, r3, ip, lr} @ 4
+ stmgeia r0!, {r2, r3, ip, lr} @ 4
+ stmgeia r0!, {r2, r3, ip, lr} @ 4
+ bgt 3b @ 1
+ ldmeqfd sp!, {pc} @ 1/2 quick exit
+/*
+ * No need to correct the count; we're only testing bits from now on
+ */
+ tst r1, #32 @ 1
+ stmneia r0!, {r2, r3, ip, lr} @ 4
+ stmneia r0!, {r2, r3, ip, lr} @ 4
+ tst r1, #16 @ 1 16 bytes or more?
+ stmneia r0!, {r2, r3, ip, lr} @ 4
+ ldr lr, [sp], #4 @ 1
+
+#else
+
+/*
+ * This version aligns the destination pointer in order to write
+ * whole cache lines at once.
+ */
+
+ stmfd sp!, {r4-r7, lr}
+ mov r4, r2
+ mov r5, r2
+ mov r6, r2
+ mov r7, r2
+ mov ip, r2
+ mov lr, r2
+
+ cmp r1, #96
+ andgts ip, r0, #31
+ ble 3f
+
+ rsb ip, ip, #32
+ sub r1, r1, ip
+ movs ip, ip, lsl #(32 - 4)
+ stmcsia r0!, {r4, r5, r6, r7}
+ stmmiia r0!, {r4, r5}
+ movs ip, ip, lsl #2
+ strcs r2, [r0], #4
+
+3: subs r1, r1, #64
+ stmgeia r0!, {r2-r7, ip, lr}
+ stmgeia r0!, {r2-r7, ip, lr}
+ bgt 3b
+ ldmeqfd sp!, {r4-r7, pc}
+
+ tst r1, #32
+ stmneia r0!, {r2-r7, ip, lr}
+ tst r1, #16
+ stmneia r0!, {r4-r7}
+ ldmfd sp!, {r4-r7, lr}
+
+#endif
+
+4: tst r1, #8 @ 1 8 bytes or more?
+ stmneia r0!, {r2, r3} @ 2
+ tst r1, #4 @ 1 4 bytes or more?
+ strne r2, [r0], #4 @ 1
+/*
+ * When we get here, we've got less than 4 bytes to zero. We
+ * may have an unaligned pointer as well.
+ */
+5: tst r1, #2 @ 1 2 bytes or more?
+ strneb r2, [r0], #1 @ 1
+ strneb r2, [r0], #1 @ 1
+ tst r1, #1 @ 1 a byte left over
+ strneb r2, [r0], #1 @ 1
+ mov pc, lr @ 1
+ENDPROC(__memzero)
diff --git a/xen/arch/arm/lib/setbit.S b/xen/arch/arm/lib/setbit.S
new file mode 100644
index 0000000..c828851
--- /dev/null
+++ b/xen/arch/arm/lib/setbit.S
@@ -0,0 +1,18 @@
+/*
+ * linux/arch/arm/lib/setbit.S
+ *
+ * Copyright (C) 1995-1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <xen/config.h>
+
+#include "assembler.h"
+#include "bitops.h"
+ .text
+
+ENTRY(_set_bit)
+ bitop orr
+ENDPROC(_set_bit)
diff --git a/xen/arch/arm/lib/testchangebit.S b/xen/arch/arm/lib/testchangebit.S
new file mode 100644
index 0000000..a7f527c
--- /dev/null
+++ b/xen/arch/arm/lib/testchangebit.S
@@ -0,0 +1,18 @@
+/*
+ * linux/arch/arm/lib/testchangebit.S
+ *
+ * Copyright (C) 1995-1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <xen/config.h>
+
+#include "assembler.h"
+#include "bitops.h"
+ .text
+
+ENTRY(_test_and_change_bit)
+ testop eor, str
+ENDPROC(_test_and_change_bit)
diff --git a/xen/arch/arm/lib/testclearbit.S b/xen/arch/arm/lib/testclearbit.S
new file mode 100644
index 0000000..8f39c72
--- /dev/null
+++ b/xen/arch/arm/lib/testclearbit.S
@@ -0,0 +1,18 @@
+/*
+ * linux/arch/arm/lib/testclearbit.S
+ *
+ * Copyright (C) 1995-1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <xen/config.h>
+
+#include "assembler.h"
+#include "bitops.h"
+ .text
+
+ENTRY(_test_and_clear_bit)
+ testop bicne, strne
+ENDPROC(_test_and_clear_bit)
diff --git a/xen/arch/arm/lib/testsetbit.S b/xen/arch/arm/lib/testsetbit.S
new file mode 100644
index 0000000..1b8d273
--- /dev/null
+++ b/xen/arch/arm/lib/testsetbit.S
@@ -0,0 +1,18 @@
+/*
+ * linux/arch/arm/lib/testsetbit.S
+ *
+ * Copyright (C) 1995-1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <xen/config.h>
+
+#include "assembler.h"
+#include "bitops.h"
+ .text
+
+ENTRY(_test_and_set_bit)
+ testop orreq, streq
+ENDPROC(_test_and_set_bit)
--
1.7.2.5

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel