[parisc-linux-cvs] linux-2.6 tausq
Randolph Chung
randolph at tausq.org
Thu Oct 7 14:58:54 MDT 2004
> Modified files:
> arch/parisc/lib: memcpy.c
>
> Log message:
> Improve copy performance for small and unaligned copies
For small buffers, all the extra branches are expensive, so we revert to
the byte-at-a-time algorithm as soon as possible. Timing experiments showed
that this is the fastest method for len <= 16.
For unaligned copies, the merge operation can be done with far fewer
instructions on a pa20 CPU by using the shrpw instruction. In a microbenchmark,
the speedup is ~2x. The kernel actually does quite a lot of unaligned
(1-byte or 2-byte aligned) copies, so copy_dstaligned() is used quite a
bit; however, the buffers being copied are usually quite small, so the
system-level impact is relatively small.
lmbench shows a modest speedup in bw_unix and bw_pipe, but the
difference (~1.5%) is probably within experimental error.
randolph
Index: arch/parisc/lib/memcpy.c
===================================================================
RCS file: /var/cvs/linux-2.6/arch/parisc/lib/memcpy.c,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -p -r1.6 -r1.7
--- arch/parisc/lib/memcpy.c 4 Oct 2004 19:12:50 -0000 1.6
+++ arch/parisc/lib/memcpy.c 7 Oct 2004 20:52:54 -0000 1.7
@@ -79,7 +79,20 @@ DECLARE_PER_CPU(struct exception_data, e
#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
#define get_kernel_space() (0)
+#ifndef CONFIG_PA20
#define MERGE(w0, sh_1, w1, sh_2) (((w0) << (sh_1)) | ((w1) >> (sh_2)))
+#else
+#define MERGE(w0, sh_1, w1, sh_2) ({ \
+ unsigned int _r; \
+ asm volatile ( \
+ "mtsar %3\n" \
+ "shrpw %1, %2, %%sar, %0\n" \
+ : "=r"(_r) \
+ : "r"(w0), "r"(w1), "r"(sh_2) \
+ ); \
+ _r; \
+})
+#endif
#define THRESHOLD 16
#ifdef DEBUG_MEMCPY
@@ -123,28 +136,31 @@ DECLARE_PER_CPU(struct exception_data, e
#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)
-#define ldw(_s,_o,_a,_t,_e) \
+#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e) \
__asm__ __volatile__ ( \
- "1:\tldw " #_o "(" _s ",%1), %0\n" \
+ "1:\t" #_insn " " #_o "(" _s ",%1), %0\n" \
"\t.section __ex_table,\"aw\"\n" \
"\t" EXC_WORD "\t1b\n" \
"\t" EXC_WORD "\t" #_e "\n" \
"\t.previous\n" \
- : "=r"(_t) \
+ : _tt(_t) \
: "r"(_a) \
: "r8")
-#define stw(_s,_t,_o,_a,_e) \
+#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e) \
__asm__ __volatile__ ( \
- "1:\tstw %0, " #_o "(" _s ",%1)\n" \
+ "1:\t" #_insn " %0, " #_o "(" _s ",%1)\n" \
"\t.section __ex_table,\"aw\"\n" \
"\t" EXC_WORD "\t1b\n" \
"\t" EXC_WORD "\t" #_e "\n" \
"\t.previous\n" \
: \
- : "r"(_t), "r"(_a) \
+ : _tt(_t), "r"(_a) \
: "r8")
+#define ldw(_s,_o,_a,_t,_e) def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
+#define stw(_s,_t,_o,_a,_e) def_store_insn(stw,"r",_s,_t,_o,_a,_e)
+
#ifdef CONFIG_PREFETCH
extern inline void prefetch_src(const void *addr)
{
@@ -301,8 +317,8 @@ unsigned long pa_memcpy(void *dstp, cons
/* prefetch_src((const void *)srcp); */
- if (unlikely(len == 0))
- return 0;
+ if (len < THRESHOLD)
+ goto byte_copy;
/* Check alignment */
t1 = (src ^ dst);
@@ -409,9 +425,6 @@ byte_copy:
return 0;
unaligned_copy:
- if (len < THRESHOLD)
- goto byte_copy;
-
/* possibly we are aligned on a word, but not on a double... */
if (likely(t1 & (sizeof(unsigned int)-1)) == 0) {
t2 = src & (sizeof(unsigned int) - 1);
More information about the parisc-linux-cvs
mailing list