[parisc-linux-cvs] linux-2.6 tausq

Randolph Chung randolph at tausq.org
Thu Oct 7 14:58:54 MDT 2004


> Modified files:
> 	arch/parisc/lib: memcpy.c 
> 
> Log message:
> Improve copy performance for small and unaligned copies

For small buffers, all the extra branches are expensive, so we fall back to
the byte-at-a-time algorithm as early as possible. Timing experiments showed
that this is the fastest method for len <= 16.

For unaligned copies, the merge operation can be done with far fewer
instructions on a PA-RISC 2.0 (pa20) CPU by using the shrpw instruction. In a
microbenchmark, the speedup is ~2x. The kernel actually does quite a lot of
unaligned (1-byte or 2-byte aligned) copies, so copy_dstaligned() is used
quite a bit; however, the buffers being copied are usually quite small, so
the system-level impact is relatively small.

lmbench shows a modest speedup in bw_unix and bw_pipe, but the 
difference (~1.5%) is probably within experimental error.

randolph

Index: arch/parisc/lib/memcpy.c
===================================================================
RCS file: /var/cvs/linux-2.6/arch/parisc/lib/memcpy.c,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -p -r1.6 -r1.7
--- arch/parisc/lib/memcpy.c	4 Oct 2004 19:12:50 -0000	1.6
+++ arch/parisc/lib/memcpy.c	7 Oct 2004 20:52:54 -0000	1.7
@@ -79,7 +79,20 @@ DECLARE_PER_CPU(struct exception_data, e
 #define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
 #define get_kernel_space() (0)
 
+#ifndef CONFIG_PA20
 #define MERGE(w0, sh_1, w1, sh_2) (((w0) << (sh_1)) | ((w1) >> (sh_2)))
+#else
+#define MERGE(w0, sh_1, w1, sh_2)  ({					\
+	unsigned int _r;						\
+	asm volatile (							\
+	"mtsar %3\n"							\
+	"shrpw %1, %2, %%sar, %0\n"					\
+	: "=r"(_r)							\
+	: "r"(w0), "r"(w1), "r"(sh_2)					\
+	);								\
+	_r;								\
+})
+#endif
 #define THRESHOLD	16
 
 #ifdef DEBUG_MEMCPY
@@ -123,28 +136,31 @@ DECLARE_PER_CPU(struct exception_data, e
 #define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
 #define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)
 
-#define ldw(_s,_o,_a,_t,_e) 				\
+#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e) 	\
 	__asm__ __volatile__ (				\
-	"1:\tldw " #_o "(" _s ",%1), %0\n"		\
+	"1:\t" #_insn " " #_o "(" _s ",%1), %0\n"	\
 	"\t.section __ex_table,\"aw\"\n"		\
 	"\t" EXC_WORD "\t1b\n"				\
 	"\t" EXC_WORD "\t" #_e "\n"			\
 	"\t.previous\n"					\
-	: "=r"(_t) 					\
+	: _tt(_t) 					\
 	: "r"(_a)					\
 	: "r8")
 
-#define stw(_s,_t,_o,_a,_e) 				\
+#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e) 	\
 	__asm__ __volatile__ (				\
-	"1:\tstw %0, " #_o "(" _s ",%1)\n" 		\
+	"1:\t" #_insn " %0, " #_o "(" _s ",%1)\n" 	\
 	"\t.section __ex_table,\"aw\"\n"		\
 	"\t" EXC_WORD "\t1b\n"				\
 	"\t" EXC_WORD "\t" #_e "\n"			\
 	"\t.previous\n"					\
 	: 						\
-	: "r"(_t), "r"(_a)				\
+	: _tt(_t), "r"(_a)				\
 	: "r8")
 
+#define ldw(_s,_o,_a,_t,_e)	def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
+#define stw(_s,_t,_o,_a,_e) 	def_store_insn(stw,"r",_s,_t,_o,_a,_e)
+
 #ifdef  CONFIG_PREFETCH
 extern inline void prefetch_src(const void *addr)
 {
@@ -301,8 +317,8 @@ unsigned long pa_memcpy(void *dstp, cons
 
 	/* prefetch_src((const void *)srcp); */
 
-	if (unlikely(len == 0))
-		return 0;
+	if (len < THRESHOLD)
+		goto byte_copy;
 
 	/* Check alignment */
 	t1 = (src ^ dst);
@@ -409,9 +425,6 @@ byte_copy:
 	return 0;
 
 unaligned_copy:
-	if (len < THRESHOLD)
-		goto byte_copy;
-
 	/* possibly we are aligned on a word, but not on a double... */
 	if (likely(t1 & (sizeof(unsigned int)-1)) == 0) {
 		t2 = src & (sizeof(unsigned int) - 1);


More information about the parisc-linux-cvs mailing list