/*
 * Fast bcopy code which supports overlapped copies.
 * Not fully optimized yet.
 *
 * Written by: Kipp Hickman
 *
 * $Source: /root/leakn64/depot/rf/sw/n64os20l/libultra/monegi/libc/bcopy.s,v $
 * $Revision: 1.1.1.2 $
 * $Date: 2002/10/29 08:06:43 $
 */

#define ISBCOPY

#ifdef	ISBCOPY
.weakext bcopy, _bcopy
#else
/* memcpy is ANSI-defined, so no weak symbol is needed */
#endif

#include <asm.h>
#include <regdef.h>

/*
 * char *bcopy(from, to, count);
 *	unsigned char *from, *to;
 *	unsigned long count;
 *
 * OR
 *
 * void *memcpy/memmove(to, from, count);
 *	void *to, *from;
 *	unsigned long count;
 *
 * Both functions return "to"
 */
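
/*
 * A reference model in C, for orientation only.  This is a hedged
 * sketch of what the assembly below implements (the helper name
 * "ref_copy" is illustrative, and the word-aligned fast paths are
 * deliberately omitted here):
 *
 *	void *ref_copy(void *to, const void *from, unsigned long count)
 *	{
 *		unsigned char *d = to;
 *		const unsigned char *s = from;
 *		if (d < s || d >= s + count) {
 *			while (count--)		// no overlap hazard:
 *				*d++ = *s++;	//  copy ascending
 *		} else {
 *			d += count;		// "to" lies inside the
 *			s += count;		//  source: copy descending
 *			while (count--)
 *				*--d = *--s;
 *		}
 *		return to;
 *	}
 */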

#define	MINCOPY	16

/* registers used */

#ifdef	ISBCOPY
#define	from	a0
#define	to	a1
#define	count	a2
LEAF(_bcopy)
#else
#define	to	a0
#define	from	a1
#define	count	a2
LEAF(memcpy)
XLEAF(memmove)
#endif
	move	a3,to			# Save to in a3
	beq	count,zero,ret		# Test for zero count
	beq	from,to,ret		# Test for from == to

	/* use backwards copying code if the from and to regions overlap */
	blt	to,from,goforwards	# If to < from then use forwards copy
	add	v0,from,count		# v0 := from + count
	bge	to,v0,goforwards	# If to >= from + count, no overlap
	b	gobackwards		# Oh well, go backwards
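
/*
 * In C terms the dispatch above is (sketch; addresses compared as
 * integers, labels named after the assembly targets):
 *
 *	if (to < from || to >= from + count)
 *		goto goforwards;	// destination clear of the source
 *	else
 *		goto gobackwards;	// "to" lies in [from, from + count)
 */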

/*****************************************************************************/

/*
 * Forward copy code.  Check for pointer alignment and try to get both
 * pointers aligned on a long boundary.
 */
goforwards:
	/* small byte counts use byte at a time copy */
	blt	count,MINCOPY,forwards_bytecopy
	and	v0,from,3		# v0 := from & 3
	and	v1,to,3			# v1 := to & 3
	beq	v0,v1,forwalignable	# low bits are identical
/*
 * Byte at a time copy code.  This is used when the pointers are not
 * alignable, when the byte count is small, or when cleaning up any
 * remaining bytes on a larger transfer.
 */
forwards_bytecopy:
	beq	count,zero,ret		# If count is zero, then we are done
	addu	v1,from,count		# v1 := from + count

99:	lb	v0,0(from)		# v0 = *from
	addu	from,1			# advance pointer
	sb	v0,0(to)		# Store byte
	addu	to,1			# advance pointer
	bne	from,v1,99b		# Loop until done
ret:	move	v0,a3			# Set v0 to old "to" pointer
	j	ra			# return to caller

/*
 * Pointers are alignable, and may already be aligned.  Since v0 == v1, we need only
 * check what value v0 has to see how to get aligned.  Also, since we have
 * eliminated tiny copies, we know that the count is large enough to
 * encompass the alignment copies.
 */
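
/*
 * A C sketch of this alignment prologue (assumes, as established
 * above, that "from" and "to" share the same two low address bits
 * and that count >= MINCOPY):
 *
 *	unsigned long skew = (unsigned long)from & 3;
 *	if (skew != 0) {
 *		unsigned long fix = 4 - skew;	// 3, 2, or 1 bytes
 *		count -= fix;
 *		while (fix--)
 *			*to++ = *from++;
 *	}
 *	// both pointers now sit on a word boundary
 *
 * The assembly below does the 3-byte case as one lb plus one lh
 * rather than three byte copies.
 */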
forwalignable:
	beq	v0,zero,forwards	# If v0==v1 && v0==0 then aligned
	beq	v0,1,forw_copy3		# Need to copy 3 bytes to get aligned
	beq	v0,2,forw_copy2		# Need to copy 2 bytes to get aligned

/* need to copy 1 byte */
	lb	v0,0(from)		# get one byte
	addu	from,1			#  advance pointer
	sb	v0,0(to)		#   store one byte
	addu	to,1			#    advance pointer
	subu	count,1			#     and reduce count
	b	forwards		# Now pointers are aligned

/* need to copy 2 bytes */
forw_copy2:
	lh	v0,0(from)		# get one short
	addu	from,2			#  advance pointer
	sh	v0,0(to)		#   store one short
	addu	to,2			#    advance pointer
	subu	count,2			#     and reduce count
	b	forwards

/* need to copy 3 bytes */
forw_copy3:
	lb	v0,0(from)		# get one byte
	lh	v1,1(from)		#  and one short
	addu	from,3			#  advance pointer
	sb	v0,0(to)		#   store one byte
	sh	v1,1(to)		#    and one short
	addu	to,3			#    advance pointer
	subu	count,3			#     and reduce count
	/* FALLTHROUGH */
/*
 * Once we are here, the pointers are aligned on long boundaries.
 * Begin copying in large chunks.
 */
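
/*
 * Equivalent C for the loops below (sketch; "f" and "t" are
 * illustrative word pointers).  The 32-byte loop is unrolled into
 * eight load/store pairs, with all loads issued before the stores:
 *
 *	unsigned int *f = (unsigned int *)from;
 *	unsigned int *t = (unsigned int *)to;
 *	while (count >= 32) {
 *		t[0] = f[0]; t[1] = f[1]; t[2] = f[2]; t[3] = f[3];
 *		t[4] = f[4]; t[5] = f[5]; t[6] = f[6]; t[7] = f[7];
 *		f += 8; t += 8; count -= 32;
 *	}
 *	// a 16-byte pass (same idea, four words) runs at most once,
 *	// then single words:
 *	while (count >= 4) {
 *		*t++ = *f++;
 *		count -= 4;
 *	}
 *	// any 1-3 byte tail is finished by forwards_bytecopy
 */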
forwards:

/* 32 byte at a time loop */
forwards_32:
	blt	count,32,forwards_16	# do 16 bytes at a time
	lw	v0,0(from)
	lw	v1,4(from)
	lw	t0,8(from)
	lw	t1,12(from)
	lw	t2,16(from)
	lw	t3,20(from)
	lw	ta0,24(from)
	lw	ta1,28(from)		# Fetch 8*4 bytes
	addu	from,32			# advance from pointer now
	sw	v0,0(to)
	sw	v1,4(to)
	sw	t0,8(to)
	sw	t1,12(to)
	sw	t2,16(to)
	sw	t3,20(to)
	sw	ta0,24(to)
	sw	ta1,28(to)		# Store 8*4 bytes
	addu	to,32			# advance to pointer now
	subu	count,32		# Reduce count
	b	forwards_32		# Try some more

/* 16 byte at a time loop */
forwards_16:
	blt	count,16,forwards_4	# Do rest in words
	lw	v0,0(from)
	lw	v1,4(from)
	lw	t0,8(from)
	lw	t1,12(from)
	addu	from,16			# advance from pointer now
	sw	v0,0(to)
	sw	v1,4(to)
	sw	t0,8(to)
	sw	t1,12(to)
	addu	to,16			# advance to pointer now
	subu	count,16		# Reduce count
	b	forwards_16		# Try some more

/* 4 byte at a time loop */
forwards_4:
	blt	count,4,forwards_bytecopy	# Do rest
	lw	v0,0(from)
	addu	from,4			# advance pointer
	sw	v0,0(to)
	addu	to,4			# advance pointer
	subu	count,4
	b	forwards_4

/*****************************************************************************/

/*
 * Backward copy code.  Check for pointer alignment and try to get both
 * pointers aligned on a long boundary.
 */
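
/*
 * The backward strategy in C (sketch): advance both pointers one
 * past their last byte, then copy descending so that overlapping
 * source bytes are read before they are overwritten.
 *
 *	from += count;
 *	to   += count;
 *	while (count--)
 *		*--to = *--from;	// byte-at-a-time form
 *
 * As with the forward path, the real code below aligns the pointers
 * and copies in word-sized chunks when count permits.
 */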
gobackwards:
	add	from,count		# Advance to end + 1
	add	to,count		# Advance to end + 1

	/* small byte counts use byte at a time copy */
	blt	count,MINCOPY,backwards_bytecopy
	and	v0,from,3		# v0 := from & 3
	and	v1,to,3			# v1 := to & 3
	beq	v0,v1,backalignable	# low bits are identical
/*
 * Byte at a time copy code.  This is used when the pointers are not
 * alignable, when the byte count is small, or when cleaning up any
 * remaining bytes on a larger transfer.
 */
backwards_bytecopy:
	beq	count,zero,ret		# If count is zero quit
	subu	from,1			# Reduce by one (point at byte)
	subu	to,1			# Reduce by one (point at byte)
	subu	v1,from,count		# v1 := original from - 1

99:	lb	v0,0(from)		# v0 = *from
	subu	from,1			# backup pointer
	sb	v0,0(to)		# Store byte
	subu	to,1			# backup pointer
	bne	from,v1,99b		# Loop until done
	move	v0,a3			# Set v0 to old "to" pointer
	j	ra			# return to caller

/*
 * Pointers are alignable, and may already be aligned.  Since v0 == v1, we need only
 * check what value v0 has to see how to get aligned.  Also, since we have
 * eliminated tiny copies, we know that the count is large enough to
 * encompass the alignment copies.
 */
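
/*
 * C sketch of the backward alignment prologue.  It mirrors the
 * forward case, with one twist: "from" now points one past the last
 * byte, so (from & 3) is itself the number of bytes to copy down to
 * reach a word boundary.
 *
 *	unsigned long skew = (unsigned long)from & 3;	// 3, 2, or 1
 *	count -= skew;
 *	while (skew--)
 *		*--to = *--from;
 */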
backalignable:
	beq	v0,zero,backwards	# If v0==v1 && v0==0 then aligned
	beq	v0,3,back_copy3		# Need to copy 3 bytes to get aligned
	beq	v0,2,back_copy2		# Need to copy 2 bytes to get aligned

/* need to copy 1 byte */
	lb	v0,-1(from)		# get one byte
	subu	from,1			# backup pointer
	sb	v0,-1(to)		# store one byte
	subu	to,1			# backup pointer
	subu	count,1			#  and reduce count
	b	backwards		# Now pointers are aligned

/* need to copy 2 bytes */
back_copy2:
	lh	v0,-2(from)		# get one short
	subu	from,2			# backup pointer
	sh	v0,-2(to)		# store one short
	subu	to,2			# backup pointer
	subu	count,2			#  and reduce count
	b	backwards

/* need to copy 3 bytes */
back_copy3:
	lb	v0,-1(from)		# get one byte
	lh	v1,-3(from)		#  and one short
	subu	from,3			# backup pointer
	sb	v0,-1(to)		#  store one byte
	sh	v1,-3(to)		#   and one short
	subu	to,3			# backup pointer
	subu	count,3			#  and reduce count
	/* FALLTHROUGH */
/*
 * Once we are here, the pointers are aligned on long boundaries.
 * Begin copying in large chunks.
 */
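
/*
 * The descending word loops below mirror the forward ones; in C
 * (sketch, with "f" and "t" word pointers sitting one past the next
 * word to move):
 *
 *	while (count >= 4) {
 *		*--t = *--f;
 *		count -= 4;
 *	}
 *
 * The 32- and 16-byte versions unroll this with negative offsets
 * (-4 through -32) before decrementing the pointers.
 */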
backwards:

/* 32 byte at a time loop */
backwards_32:
	blt	count,32,backwards_16	# do 16 bytes at a time
	lw	v0,-4(from)
	lw	v1,-8(from)
	lw	t0,-12(from)
	lw	t1,-16(from)
	lw	t2,-20(from)
	lw	t3,-24(from)
	lw	ta0,-28(from)
	lw	ta1,-32(from)		# Fetch 8*4 bytes
	subu	from,32			# backup from pointer now
	sw	v0,-4(to)
	sw	v1,-8(to)
	sw	t0,-12(to)
	sw	t1,-16(to)
	sw	t2,-20(to)
	sw	t3,-24(to)
	sw	ta0,-28(to)
	sw	ta1,-32(to)		# Store 8*4 bytes
	subu	to,32			# backup to pointer now
	subu	count,32		# Reduce count
	b	backwards_32		# Try some more

/* 16 byte at a time loop */
backwards_16:
	blt	count,16,backwards_4	# Do rest in words
	lw	v0,-4(from)
	lw	v1,-8(from)
	lw	t0,-12(from)
	lw	t1,-16(from)
	subu	from,16			# backup from pointer now
	sw	v0,-4(to)
	sw	v1,-8(to)
	sw	t0,-12(to)
	sw	t1,-16(to)
	subu	to,16			# backup to pointer now
	subu	count,16		# Reduce count
	b	backwards_16		# Try some more

/* 4 byte at a time loop */
backwards_4:
	blt	count,4,backwards_bytecopy	# Do rest
	lw	v0,-4(from)
	subu	from,4			# backup from pointer
	sw	v0,-4(to)
	subu	to,4			# backup to pointer
	subu	count,4			# Reduce count
	b	backwards_4
#ifdef	ISBCOPY
	END(_bcopy)
#else
	END(memcpy)
#endif