From 194576157afb34f7ce69cde800bf9715c730b39f Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Thu, 15 Oct 2009 12:45:57 +0200 Subject: [PATCH] target-arm: convert NEON VZIP/VUZP/VTRN helper functions to pure TCG The neon_trn_u8, neon_trn_u16, neon_unzip_u8, neon_zip_u8 and neon_zip_u16 helpers used fixed registers to return values. This patch replaces that with TCG code, so T0/T1 is no longer directly used by the helper functions. Bugs in the gen_neon_unzip register load code were also fixed. Signed-off-by: Filip Navara Signed-off-by: Aurelien Jarno --- target-arm/helpers.h | 6 -- target-arm/op_helper.c | 58 ---------------- target-arm/translate.c | 151 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 142 insertions(+), 73 deletions(-) diff --git a/target-arm/helpers.h b/target-arm/helpers.h index 01175e157a..f298efff42 100644 --- a/target-arm/helpers.h +++ b/target-arm/helpers.h @@ -338,12 +338,6 @@ DEF_HELPER_2(neon_qneg_s8, i32, env, i32) DEF_HELPER_2(neon_qneg_s16, i32, env, i32) DEF_HELPER_2(neon_qneg_s32, i32, env, i32) -DEF_HELPER_0(neon_trn_u8, void) -DEF_HELPER_0(neon_trn_u16, void) -DEF_HELPER_0(neon_unzip_u8, void) -DEF_HELPER_0(neon_zip_u8, void) -DEF_HELPER_0(neon_zip_u16, void) - DEF_HELPER_2(neon_min_f32, i32, i32, i32) DEF_HELPER_2(neon_max_f32, i32, i32, i32) DEF_HELPER_2(neon_abd_f32, i32, i32, i32) diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c index d4ae4ae7e3..5ac631df0e 100644 --- a/target-arm/op_helper.c +++ b/target-arm/op_helper.c @@ -495,61 +495,3 @@ uint64_t HELPER(neon_sub_saturate_u64)(uint64_t src1, uint64_t src2) } return res; } - -/* These need to return a pair of value, so still use T0/T1. */ -/* Transpose. Argument order is rather strange to avoid special casing - the tranlation code. - On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm */ -void HELPER(neon_trn_u8)(void) -{ - uint32_t rd; - uint32_t rm; - rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff); - rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00); - T0 = rd; - T1 = rm; -} - -void HELPER(neon_trn_u16)(void) -{ - uint32_t rd; - uint32_t rm; - rd = (T0 << 16) | (T1 & 0xffff); - rm = (T1 >> 16) | (T0 & 0xffff0000); - T0 = rd; - T1 = rm; -} - -/* Worker routines for zip and unzip. */ -void HELPER(neon_unzip_u8)(void) -{ - uint32_t rd; - uint32_t rm; - rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00) - | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000); - rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00) - | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000); - T0 = rd; - T1 = rm; -} - -void HELPER(neon_zip_u8)(void) -{ - uint32_t rd; - uint32_t rm; - rd = (T0 & 0xff) | ((T1 << 8) & 0xff00) - | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000); - rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00) - | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000); - T0 = rd; - T1 = rm; -} - -void HELPER(neon_zip_u16)(void) -{ - uint32_t tmp; - - tmp = (T0 & 0xffff) | (T1 << 16); - T1 = (T1 & 0xffff0000) | (T0 >> 16); - T0 = tmp; -} diff --git a/target-arm/translate.c b/target-arm/translate.c index 8e5380cc6e..8c5afb7234 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -3627,24 +3627,157 @@ static inline void gen_neon_get_scalar(int size, int reg) } } +static void gen_neon_unzip_u8(TCGv t0, TCGv t1) +{ + TCGv rd, rm, tmp; + + rd = new_tmp(); + rm = new_tmp(); + tmp = new_tmp(); + + tcg_gen_andi_i32(rd, t0, 0xff); + tcg_gen_shri_i32(tmp, t0, 8); + tcg_gen_andi_i32(tmp, tmp, 0xff00); + tcg_gen_or_i32(rd, rd, tmp); + tcg_gen_shli_i32(tmp, t1, 16); + tcg_gen_andi_i32(tmp, tmp, 0xff0000); + tcg_gen_or_i32(rd, rd, tmp); + tcg_gen_shli_i32(tmp, t1, 8); + tcg_gen_andi_i32(tmp, tmp, 0xff000000); + tcg_gen_or_i32(rd, rd, tmp); + + tcg_gen_shri_i32(rm, t0, 8); + tcg_gen_andi_i32(rm, rm, 0xff); + tcg_gen_shri_i32(tmp, t0, 16); + tcg_gen_andi_i32(tmp, tmp, 0xff00); + tcg_gen_or_i32(rm, rm, tmp); + tcg_gen_shli_i32(tmp, t1, 8); + tcg_gen_andi_i32(tmp, tmp, 0xff0000); + tcg_gen_or_i32(rm, rm, tmp); + tcg_gen_andi_i32(tmp, t1, 0xff000000); + tcg_gen_or_i32(t1, rm, tmp); + tcg_gen_mov_i32(t0, rd); + + dead_tmp(tmp); + dead_tmp(rm); + dead_tmp(rd); +} + +static void gen_neon_zip_u8(TCGv t0, TCGv t1) +{ + TCGv rd, rm, tmp; + + rd = new_tmp(); + rm = new_tmp(); + tmp = new_tmp(); + + tcg_gen_andi_i32(rd, t0, 0xff); + tcg_gen_shli_i32(tmp, t1, 8); + tcg_gen_andi_i32(tmp, tmp, 0xff00); + tcg_gen_or_i32(rd, rd, tmp); + tcg_gen_shli_i32(tmp, t0, 16); + tcg_gen_andi_i32(tmp, tmp, 0xff0000); + tcg_gen_or_i32(rd, rd, tmp); + tcg_gen_shli_i32(tmp, t1, 24); + tcg_gen_andi_i32(tmp, tmp, 0xff000000); + tcg_gen_or_i32(rd, rd, tmp); + + tcg_gen_andi_i32(rm, t1, 0xff000000); + tcg_gen_shri_i32(tmp, t0, 8); + tcg_gen_andi_i32(tmp, tmp, 0xff0000); + tcg_gen_or_i32(rm, rm, tmp); + tcg_gen_shri_i32(tmp, t1, 8); + tcg_gen_andi_i32(tmp, tmp, 0xff00); + tcg_gen_or_i32(rm, rm, tmp); + tcg_gen_shri_i32(tmp, t0, 16); + tcg_gen_andi_i32(tmp, tmp, 0xff); + tcg_gen_or_i32(t1, rm, tmp); + tcg_gen_mov_i32(t0, rd); + + dead_tmp(tmp); + dead_tmp(rm); + dead_tmp(rd); +} + +static void gen_neon_zip_u16(TCGv t0, TCGv t1) +{ + TCGv tmp, tmp2; + + tmp = new_tmp(); + tmp2 = new_tmp(); + + tcg_gen_andi_i32(tmp, t0, 0xffff); + tcg_gen_shli_i32(tmp2, t1, 16); + tcg_gen_or_i32(tmp, tmp, tmp2); + tcg_gen_andi_i32(t1, t1, 0xffff0000); + tcg_gen_shri_i32(tmp2, t0, 16); + tcg_gen_or_i32(t1, t1, tmp2); + tcg_gen_mov_i32(t0, tmp); + + dead_tmp(tmp2); + dead_tmp(tmp); +} + static void gen_neon_unzip(int reg, int q, int tmp, int size) { int n; for (n = 0; n < q + 1; n += 2) { NEON_GET_REG(T0, reg, n); - NEON_GET_REG(T0, reg, n + n); + NEON_GET_REG(T1, reg, n + 1); switch (size) { - case 0: gen_helper_neon_unzip_u8(); break; - case 1: gen_helper_neon_zip_u16(); break; /* zip and unzip are the same. */ + case 0: gen_neon_unzip_u8(cpu_T[0], cpu_T[1]); break; + case 1: gen_neon_zip_u16(cpu_T[0], cpu_T[1]); break; /* zip and unzip are the same. */ case 2: /* no-op */; break; default: abort(); } - gen_neon_movl_scratch_T0(tmp + n); - gen_neon_movl_scratch_T1(tmp + n + 1); + gen_neon_movl_T0_scratch(tmp + n); + gen_neon_movl_T1_scratch(tmp + n + 1); } } +static void gen_neon_trn_u8(TCGv t0, TCGv t1) +{ + TCGv rd, tmp; + + rd = new_tmp(); + tmp = new_tmp(); + + tcg_gen_shli_i32(rd, t0, 8); + tcg_gen_andi_i32(rd, rd, 0xff00ff00); + tcg_gen_andi_i32(tmp, t1, 0x00ff00ff); + tcg_gen_or_i32(rd, rd, tmp); + + tcg_gen_shri_i32(t1, t1, 8); + tcg_gen_andi_i32(t1, t1, 0x00ff00ff); + tcg_gen_andi_i32(tmp, t0, 0xff00ff00); + tcg_gen_or_i32(t1, t1, tmp); + tcg_gen_mov_i32(t0, rd); + + dead_tmp(tmp); + dead_tmp(rd); +} + +static void gen_neon_trn_u16(TCGv t0, TCGv t1) +{ + TCGv rd, tmp; + + rd = new_tmp(); + tmp = new_tmp(); + + tcg_gen_shli_i32(rd, t0, 16); + tcg_gen_andi_i32(tmp, t1, 0xffff); + tcg_gen_or_i32(rd, rd, tmp); + tcg_gen_shri_i32(t1, t1, 16); + tcg_gen_andi_i32(tmp, t0, 0xffff0000); + tcg_gen_or_i32(t1, t1, tmp); + tcg_gen_mov_i32(t0, rd); + + dead_tmp(tmp); + dead_tmp(rd); +} + + static struct { int nregs; int interleave; @@ -5256,8 +5389,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) NEON_GET_REG(T0, rd, n); NEON_GET_REG(T1, rd, n); switch (size) { - case 0: gen_helper_neon_zip_u8(); break; - case 1: gen_helper_neon_zip_u16(); break; + case 0: gen_neon_zip_u8(cpu_T[0], cpu_T[1]); break; + case 1: gen_neon_zip_u16(cpu_T[0], cpu_T[1]); break; case 2: /* no-op */; break; default: abort(); } @@ -5442,8 +5575,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) case 33: /* VTRN */ NEON_GET_REG(T1, rd, pass); switch (size) { - case 0: gen_helper_neon_trn_u8(); break; - case 1: gen_helper_neon_trn_u16(); break; + case 0: gen_neon_trn_u8(cpu_T[0], cpu_T[1]); break; + case 1: gen_neon_trn_u16(cpu_T[0], cpu_T[1]); break; case 2: abort(); default: return 1; }