Max 5 API Reference
00001 /* 00002 00003 jit.op.simd.h 00004 00005 Copyright 2001-2005 - Cycling '74 00006 Joshua Kit Clayton jkc@cycling74.com 00007 00008 C macros useful for SIMD development 00009 00010 */ 00011 00012 #ifndef _JIT_OP_SIMD_H_ 00013 #define _JIT_OP_SIMD_H_ 00014 00015 //#ifdef WIN32 00016 //#include "jit.sse.h" 00017 //#else 00018 #include "jit.altivec.h" 00019 //#endif 00020 00021 00022 #ifndef WIN32 00023 00024 #define JIT_OP_CHAR_USE_SIMD 1 00025 #define JIT_OP_LONG_USE_SIMD 0 00026 #define JIT_OP_FLOAT32_USE_SIMD 0 00027 00028 #else 00029 00030 #define JIT_OP_CHAR_USE_SIMD 0 00031 #define JIT_OP_LONG_USE_SIMD 0 00032 #define JIT_OP_FLOAT32_USE_SIMD 0 00033 00034 #endif //WIN32 00035 00036 00037 // ---------------------------------------------------- 00038 // Altivec binary op macros 00039 00040 00041 // The preamble assumes input ip0, and ip1, and output op pointers. 00042 // a0-3 are the ip0 vector registers 00043 // b0-3 are the ip1 vector registers 00044 // d0-3 are the op vector registers 00045 // c0-3 and t0-3 are temporary vector registers for use in your work macro (not always used) 00046 // sin0 and sin1 are scalar values for input, and st0 and st1 are scalar temps (not always used) 00047 // we also setup constants used for the unrolled loops of processing 4 vectors at 00048 // a time (VLOOP4), then 1 vector at a time (VLOOP1), then 1 scalar at a time (SLOOP) 00049 // >>2 is div 4 and &= 0x03 is mod 4 00050 00051 00052 #define JIT_OP_ALTIVEC_BINARY_PREAMBLE(vtype) \ 00053 __vector vtype *v_ip0=(__vector vtype *)ip0; \ 00054 __vector vtype *v_ip1=(__vector vtype *)ip1; \ 00055 __vector vtype *v_op=(__vector vtype *)op; \ 00056 __vector vtype a0; \ 00057 __vector vtype a1; \ 00058 __vector vtype a2; \ 00059 __vector vtype a3; \ 00060 __vector vtype b0; \ 00061 __vector vtype b1; \ 00062 __vector vtype b2; \ 00063 __vector vtype b3; \ 00064 __vector vtype c0; \ 00065 __vector vtype c1; \ 00066 __vector vtype c2; \ 00067 __vector vtype c3; \ 00068 __vector vtype d0; \ 00069 __vector vtype d1; \ 00070 __vector vtype d2; \ 00071 __vector vtype d3; \ 00072 __vector vtype t0; \ 00073 __vector vtype t1; \ 00074 __vector vtype t2; \ 00075 __vector vtype t3; \ 00076 vtype sin0,sin1,st0,st1; \ 00077 long v_steps,v_steps2,i,velemcount; \ 00078 UInt32 prefetchSize = AltivecGetPrefetchConstant(16, 1, 256); \ 00079 \ 00080 velemcount = 16/sizeof(vtype); \ 00081 v_steps = (n/velemcount); \ 00082 v_steps2 = v_steps>>2; \ 00083 n = (n%velemcount); \ 00084 \ 00085 ip0 += v_steps*velemcount; \ 00086 ip1 += v_steps*velemcount; \ 00087 op += v_steps*velemcount; \ 00088 v_steps &= 0x03; 00089 00090 00091 #define JIT_OP_ALTIVEC_BINARY_VLOOP4_LOAD(vtype) \ 00092 vec_dstt( v_ip0, prefetchSize, 0 ); \ 00093 vec_dstt( v_ip1, prefetchSize, 1 ); \ 00094 \ 00095 a0 = vec_ldl(0, v_ip0 ); \ 00096 a1 = vec_ldl(1 * sizeof(__vector vtype), v_ip0); \ 00097 a2 = vec_ldl(2 * sizeof(__vector vtype), v_ip0); \ 00098 a3 = vec_ldl(3 * sizeof(__vector vtype), v_ip0); \ 00099 \ 00100 b0 = vec_ldl(0, v_ip1 ); \ 00101 b1 = vec_ldl(1 * sizeof(__vector vtype), v_ip1); \ 00102 b2 = vec_ldl(2 * sizeof(__vector vtype), v_ip1); \ 00103 b3 = vec_ldl(3 * sizeof(__vector vtype), v_ip1); 00104 00105 00106 #define JIT_OP_ALTIVEC_BINARY_VLOOP4_WORK_OP(voperator) \ 00107 d0 = voperator (a0,b0); \ 00108 d1 = voperator (a1,b1); \ 00109 d2 = voperator (a2,b2); \ 00110 d3 = voperator (a3,b3); 00111 00112 00113 #define JIT_OP_ALTIVEC_BINARY_VLOOP4_STORE(vtype) \ 00114 vec_st(d0, 0, v_op); \ 00115 vec_st(d1, 1 * sizeof(__vector vtype), v_op); \ 00116 vec_st(d2, 2 * sizeof(__vector vtype), v_op); \ 00117 vec_st(d3, 3 * sizeof(__vector vtype), v_op); \ 00118 \ 00119 v_ip0 += 4; \ 00120 v_ip1 += 4; \ 00121 v_op += 4; 00122 00123 00124 #define JIT_OP_ALTIVEC_BINARY_VLOOP4_GENERIC(vtype,vwork4) \ 00125 for (i=0;i<v_steps2;i++) { \ 00126 JIT_OP_ALTIVEC_BINARY_VLOOP4_LOAD(vtype); \ 00127 vwork4; \ 00128 JIT_OP_ALTIVEC_BINARY_VLOOP4_STORE(vtype); \ 00129 } 00130 00131 #define JIT_OP_ALTIVEC_BINARY_VLOOP4(vtype,voperator) \ 00132 JIT_OP_ALTIVEC_BINARY_VLOOP4_GENERIC(vtype,JIT_OP_ALTIVEC_BINARY_VLOOP4_WORK_OP(voperator)); 00133 00134 00135 #define JIT_OP_ALTIVEC_BINARY_VLOOP1_WORK_OP(voperator) \ 00136 v_op[i] = voperator (v_ip0[i],v_ip1[i]); 00137 00138 00139 #define JIT_OP_ALTIVEC_BINARY_VLOOP1_GENERIC(vtype,vwork1) \ 00140 for (i=0;i<v_steps;i++) { \ 00141 vwork1; \ 00142 } 00143 00144 00145 #define JIT_OP_ALTIVEC_BINARY_VLOOP1(vtype,voperator) \ 00146 JIT_OP_ALTIVEC_BINARY_VLOOP1_GENERIC(vtype,JIT_OP_ALTIVEC_BINARY_VLOOP1_WORK_OP(voperator)); 00147 00148 00149 #define JIT_OP_ALTIVEC_BINARY_SLOOP_WORK_OP(soperator) \ 00150 sin0 = *ip0++; \ 00151 sin1 = *ip1++; \ 00152 *op++ = soperator (sin0,sin1); \ 00153 00154 00155 #define JIT_OP_ALTIVEC_BINARY_SLOOP_GENERIC(stype,swork) \ 00156 while (n--) { \ 00157 swork; \ 00158 } 00159 00160 00161 #define JIT_OP_ALTIVEC_BINARY_SLOOP(stype,soperator) \ 00162 JIT_OP_ALTIVEC_BINARY_SLOOP_GENERIC(stype,JIT_OP_ALTIVEC_BINARY_SLOOP_WORK_OP(soperator)); 00163 00164 00165 #define JIT_OP_ALTIVEC_BINARY_LOOPS(vtype,voperator,soperator) \ 00166 JIT_OP_ALTIVEC_BINARY_VLOOP4(vtype,voperator); \ 00167 JIT_OP_ALTIVEC_BINARY_VLOOP1(vtype,voperator); \ 00168 JIT_OP_ALTIVEC_BINARY_SLOOP(vtype,soperator); 00169 00170 #define JIT_OP_ALTIVEC_BINARY_BIG(vtype,voperator,soperator) \ 00171 JIT_OP_ALTIVEC_BINARY_PREAMBLE(vtype); \ 00172 JIT_OP_ALTIVEC_BINARY_LOOPS(vtype,voperator,soperator); 00173 00174 #define JIT_OP_ALTIVEC_BINARY_BIG_ZERO_T0(vtype,voperator,soperator) \ 00175 JIT_OP_ALTIVEC_BINARY_PREAMBLE(vtype); \ 00176 JIT_OP_ALTIVEC_VEC_ZERO(vtype,t0) ; \ 00177 JIT_OP_ALTIVEC_BINARY_LOOPS(vtype,voperator,soperator); 00178 00179 00180 // ---------------------------------------------------- 00181 // Altivec unary op macros 00182 00183 00184 #define JIT_OP_ALTIVEC_UNARY_PREAMBLE(vtype) \ 00185 __vector vtype *v_ip0=(__vector vtype *)ip0; \ 00186 __vector vtype *v_op=(__vector vtype *)op; \ 00187 __vector vtype a0; \ 00188 __vector vtype a1; \ 00189 __vector vtype a2; \ 00190 __vector vtype a3; \ 00191 __vector vtype b0; \ 00192 __vector vtype b1; \ 00193 __vector vtype b2; \ 00194 __vector vtype b3; \ 00195 __vector vtype c0; \ 00196 __vector vtype c1; \ 00197 __vector vtype c2; \ 00198 __vector vtype c3; \ 00199 __vector vtype d0; \ 00200 __vector vtype d1; \ 00201 __vector vtype d2; \ 00202 __vector vtype d3; \ 00203 __vector vtype t0; \ 00204 __vector vtype t1; \ 00205 __vector vtype t2; \ 00206 __vector vtype t3; \ 00207 vtype sin0,sin1,st0,st1; \ 00208 long v_steps,v_steps2,i,velemcount; \ 00209 UInt32 prefetchSize = AltivecGetPrefetchConstant(16, 1, 256); \ 00210 \ 00211 velemcount = 16/sizeof(vtype); \ 00212 v_steps = (n/velemcount); \ 00213 v_steps2 = v_steps>>2; \ 00214 n = (n%velemcount); \ 00215 \ 00216 ip0 += v_steps*velemcount; \ 00217 op += v_steps*velemcount; \ 00218 v_steps &= 0x03; 00219 00220 00221 #define JIT_OP_ALTIVEC_UNARY_VLOOP4_LOAD(vtype) \ 00222 vec_dstt( v_ip0, prefetchSize, 0 ); \ 00223 \ 00224 a0 = vec_ldl(0, v_ip0 ); \ 00225 a1 = vec_ldl(1 * sizeof(__vector vtype), v_ip0); \ 00226 a2 = vec_ldl(2 * sizeof(__vector vtype), v_ip0); \ 00227 a3 = vec_ldl(3 * sizeof(__vector vtype), v_ip0); \ 00228 00229 00230 #define JIT_OP_ALTIVEC_UNARY_VLOOP4_WORK_OP(voperator) \ 00231 d0 = voperator (a0); \ 00232 d1 = voperator (a1); \ 00233 d2 = voperator (a2); \ 00234 d3 = voperator (a3); 00235 00236 00237 #define JIT_OP_ALTIVEC_UNARY_VLOOP4_STORE(vtype) \ 00238 vec_st(d0, 0, v_op); \ 00239 vec_st(d1, 1 * sizeof(__vector vtype), v_op); \ 00240 vec_st(d2, 2 * sizeof(__vector vtype), v_op); \ 00241 vec_st(d3, 3 * sizeof(__vector vtype), v_op); \ 00242 \ 00243 v_ip0 += 4; \ 00244 v_op += 4; 00245 00246 00247 #define JIT_OP_ALTIVEC_UNARY_VLOOP4_GENERIC(vtype,vwork4) \ 00248 for (i=0;i<v_steps2;i++) { \ 00249 JIT_OP_ALTIVEC_UNARY_VLOOP4_LOAD(vtype); \ 00250 vwork4; \ 00251 JIT_OP_ALTIVEC_UNARY_VLOOP4_STORE(vtype); \ 00252 } 00253 00254 #define JIT_OP_ALTIVEC_UNARY_VLOOP4(vtype,voperator) \ 00255 JIT_OP_ALTIVEC_UNARY_VLOOP4_GENERIC(vtype,JIT_OP_ALTIVEC_UNARY_VLOOP4_WORK_OP(voperator)); 00256 00257 00258 #define JIT_OP_ALTIVEC_UNARY_VLOOP1_WORK_OP(voperator) \ 00259 v_op[i] = voperator (v_ip0[i]); 00260 00261 00262 #define JIT_OP_ALTIVEC_UNARY_VLOOP1_GENERIC(vtype,vwork1) \ 00263 for (i=0;i<v_steps;i++) { \ 00264 vwork1; \ 00265 } 00266 00267 00268 #define JIT_OP_ALTIVEC_UNARY_VLOOP1(vtype,voperator) \ 00269 JIT_OP_ALTIVEC_UNARY_VLOOP1_GENERIC(vtype,JIT_OP_ALTIVEC_UNARY_VLOOP1_WORK_OP(voperator)); 00270 00271 00272 #define JIT_OP_ALTIVEC_UNARY_SLOOP_WORK_OP(soperator) \ 00273 sin0 = *ip0++; \ 00274 *op++ = soperator (sin0); \ 00275 00276 00277 #define JIT_OP_ALTIVEC_UNARY_SLOOP_GENERIC(stype,swork) \ 00278 while (n--) { \ 00279 swork; \ 00280 } 00281 00282 00283 #define JIT_OP_ALTIVEC_UNARY_SLOOP(stype,soperator) \ 00284 JIT_OP_ALTIVEC_UNARY_SLOOP_GENERIC(stype,JIT_OP_ALTIVEC_UNARY_SLOOP_WORK_OP(soperator)); 00285 00286 00287 #define JIT_OP_ALTIVEC_UNARY_LOOPS(vtype,voperator,soperator) \ 00288 JIT_OP_ALTIVEC_UNARY_VLOOP4(vtype,voperator); \ 00289 JIT_OP_ALTIVEC_UNARY_VLOOP1(vtype,voperator); \ 00290 JIT_OP_ALTIVEC_UNARY_SLOOP(vtype,soperator); 00291 00292 00293 #define JIT_OP_ALTIVEC_UNARY_BIG(vtype,voperator,soperator) \ 00294 JIT_OP_ALTIVEC_UNARY_PREAMBLE(vtype); \ 00295 JIT_OP_ALTIVEC_UNARY_LOOPS(vtype,voperator,soperator); 00296 00297 00298 #define JIT_OP_ALTIVEC_UNARY_BIG_ZERO_T0(vtype,voperator,soperator) \ 00299 JIT_OP_ALTIVEC_UNARY_PREAMBLE(vtype); \ 00300 JIT_OP_ALTIVEC_VEC_ZERO(vtype,t0) ; \ 00301 JIT_OP_ALTIVEC_UNARY_LOOPS(vtype,voperator,soperator); 00302 00303 00304 // various utility macros used in jit.op operators 00305 00306 //cheap way to make zero vector 00307 #define JIT_OP_ALTIVEC_VEC_ZERO(vtype,v) (v) = (__vector vtype) vec_andc((__vector unsigned int)(v),(__vector unsigned int)(v)) 00308 // fill vector with a scalar 00309 #define JIT_OP_ALTIVEC_VEC_FILL16(vtype,v,a) (v) = (__vector vtype) ((a),(a),(a),(a),(a),(a),(a),(a),(a),(a),(a),(a),(a),(a),(a),(a)) 00310 #define JIT_OP_ALTIVEC_VEC_FILL8(vtype,v,a) (v) = (__vector vtype) ((a),(a),(a),(a),(a),(a),(a),(a)) 00311 #define JIT_OP_ALTIVEC_VEC_FILL4(vtype,v,a) (v) = (__vector vtype) ((a),(a),(a),(a)) 00312 // set vector based on N scalars 00313 #define JIT_OP_ALTIVEC_VEC_SET16(vtype,v,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ 00314 (v) = (__vector vtype) ((a),(b),(c),(d),(e),(f),(g),(h),(i),(j),(k),(l),(m),(n),(o),(p)) 00315 #define JIT_OP_ALTIVEC_VEC_SET8(vtype,v,a,b,c,d,e,f,g,h) \ 00316 (v) = (__vector vtype) ((a),(b),(c),(d),(e),(f),(g),(h)) 00317 #define JIT_OP_ALTIVEC_VEC_SET4(vtype,v,a,b,c,d) \ 00318 (v) = (__vector vtype) ((a),(b),(c),(d)) 00319 // multiplication using t0 vector in add of multiply/add function 00320 #define JIT_OP_ALTIVEC_MULT_T0(a,b) vec_madd((a),(b),t0) 00321 // bitwise ops using t0 vector in other register 00322 #define JIT_OP_ALTIVEC_BITAND_T0(a) vec_and((a),t0) 00323 #define JIT_OP_ALTIVEC_BITANDC_T0(a) vec_andc(t0,(a)) 00324 #define JIT_OP_ALTIVEC_BITOR_T0(a) vec_or((a),t0) 00325 #define JIT_OP_ALTIVEC_BITXOR_T0(a) vec_xor((a),t0) 00326 // boolean ops using t0 vector in other register 00327 #define JIT_OP_ALTIVEC_CMPEQ_T0(a) vec_cmpeq((a),t0) 00328 #define JIT_OP_ALTIVEC_CMPEQ_T0_ULONG(a) (__vector unsigned int)vec_cmpeq((a),t0) 00329 #define JIT_OP_ALTIVEC_CMPEQ_T0_LONG(a) (__vector int)vec_cmpeq((a),t0) 00330 #define JIT_OP_ALTIVEC_CMPEQ_T0_FLOAT(a) (__vector float)vec_cmpeq((a),t0) 00331 // cast vector operator 00332 #define JIT_OP_ALTIVEC_CAST_OP(vtype,voperator) (__vector vtype)voperator 00333 00334 00335 // Boolean logic operator macros 00336 #define JIT_OP_ALTIVEC_BINARY_VLOOP4_WORK_OP_BOOL(vtype,voperator) \ 00337 c0 = (__vector vtype) vec_cmpeq(a0,t0); \ 00338 c1 = (__vector vtype) vec_cmpeq(a1,t0); \ 00339 c2 = (__vector vtype) vec_cmpeq(a2,t0); \ 00340 c3 = (__vector vtype) vec_cmpeq(a3,t0); \ 00341 \ 00342 a0 = (__vector vtype) vec_cmpeq(b0,t0); \ 00343 a1 = (__vector vtype) vec_cmpeq(b1,t0); \ 00344 a2 = (__vector vtype) vec_cmpeq(b2,t0); \ 00345 a3 = (__vector vtype) vec_cmpeq(b3,t0); \ 00346 \ 00347 b0 = (__vector vtype) vec_cmpeq(c0,t0); \ 00348 b1 = (__vector vtype) vec_cmpeq(c1,t0); \ 00349 b2 = (__vector vtype) vec_cmpeq(c2,t0); \ 00350 b3 = (__vector vtype) vec_cmpeq(c3,t0); \ 00351 \ 00352 c0 = (__vector vtype) vec_cmpeq(a0,t0); \ 00353 c1 = (__vector vtype) vec_cmpeq(a1,t0); \ 00354 c2 = (__vector vtype) vec_cmpeq(a2,t0); \ 00355 c3 = (__vector vtype) vec_cmpeq(a3,t0); \ 00356 \ 00357 d0 = voperator (b0,c0); \ 00358 d1 = voperator (b1,c1); \ 00359 d2 = voperator (b2,c2); \ 00360 d3 = voperator (b3,c3); 00361 00362 #define JIT_OP_ALTIVEC_BINARY_VLOOP1_WORK_OP_BOOL(vtype,voperator) \ 00363 a0 = (__vector vtype) vec_cmpeq(v_ip0[i],t0); \ 00364 b0 = (__vector vtype) vec_cmpeq(a0,t0); \ 00365 c0 = (__vector vtype) vec_cmpeq(v_ip1[i],t0); \ 00366 a0 = (__vector vtype) vec_cmpeq(c0,t0); \ 00367 v_op[i] = voperator (a0,b0); 00368 00369 #define JIT_OP_ALTIVEC_BINARY_BIG_BOOL(vtype,voperator,soperator) \ 00370 JIT_OP_ALTIVEC_BINARY_PREAMBLE(vtype); \ 00371 JIT_OP_ALTIVEC_VEC_ZERO(vtype,t0) ; \ 00372 JIT_OP_ALTIVEC_BINARY_VLOOP4_GENERIC(vtype,JIT_OP_ALTIVEC_BINARY_VLOOP4_WORK_OP_BOOL(vtype,voperator)); \ 00373 JIT_OP_ALTIVEC_BINARY_VLOOP1_GENERIC(vtype,JIT_OP_ALTIVEC_BINARY_VLOOP1_WORK_OP_BOOL(vtype,voperator)); \ 00374 JIT_OP_ALTIVEC_BINARY_SLOOP(vtype,soperator); 00375 00376 00377 #define JIT_OP_ALTIVEC_BINARY_VLOOP4_WORK_OP_BOOL_MASK_T1(vtype,voperator) \ 00378 c0 = (__vector vtype) vec_cmpeq(a0,t0); \ 00379 c1 = (__vector vtype) vec_cmpeq(a1,t0); \ 00380 c2 = (__vector vtype) vec_cmpeq(a2,t0); \ 00381 c3 = (__vector vtype) vec_cmpeq(a3,t0); \ 00382 \ 00383 a0 = (__vector vtype) vec_cmpeq(b0,t0); \ 00384 a1 = (__vector vtype) vec_cmpeq(b1,t0); \ 00385 a2 = (__vector vtype) vec_cmpeq(b2,t0); \ 00386 a3 = (__vector vtype) vec_cmpeq(b3,t0); \ 00387 \ 00388 b0 = (__vector vtype) vec_cmpeq(c0,t0); \ 00389 b1 = (__vector vtype) vec_cmpeq(c1,t0); \ 00390 b2 = (__vector vtype) vec_cmpeq(c2,t0); \ 00391 b3 = (__vector vtype) vec_cmpeq(c3,t0); \ 00392 \ 00393 c0 = (__vector vtype) vec_cmpeq(a0,t0); \ 00394 c1 = (__vector vtype) vec_cmpeq(a1,t0); \ 00395 c2 = (__vector vtype) vec_cmpeq(a2,t0); \ 00396 c3 = (__vector vtype) vec_cmpeq(a3,t0); \ 00397 \ 00398 a0 = voperator (b0,c0); \ 00399 a1 = voperator (b1,c1); \ 00400 a2 = voperator (b2,c2); \ 00401 a3 = voperator (b3,c3); \ 00402 \ 00403 d0 = vec_and(a0,t1); \ 00404 d1 = vec_and(a1,t1); \ 00405 d2 = vec_and(a2,t1); \ 00406 d3 = vec_and(a3,t1); \ 00407 00408 00409 #define JIT_OP_ALTIVEC_BINARY_VLOOP1_WORK_OP_BOOL_MASK_T1(vtype,voperator) \ 00410 a0 = (__vector vtype) vec_cmpeq(v_ip0[i],t0); \ 00411 b0 = (__vector vtype) vec_cmpeq(a0,t0); \ 00412 c0 = (__vector vtype) vec_cmpeq(v_ip1[i],t0); \ 00413 a0 = (__vector vtype) vec_cmpeq(c0,t0); \ 00414 c0 = voperator (a0,b0); \ 00415 d0 = vec_and(c0,t1); 00416 00417 00418 #define JIT_OP_ALTIVEC_BINARY_BIG_BOOL_LONG(vtype,voperator,soperator) \ 00419 JIT_OP_ALTIVEC_BINARY_PREAMBLE(vtype); \ 00420 JIT_OP_ALTIVEC_VEC_ZERO(vtype,t0); \ 00421 JIT_OP_ALTIVEC_VEC_FILL4(vtype,t1,0x00000001); \ 00422 JIT_OP_ALTIVEC_BINARY_VLOOP4_GENERIC(vtype,JIT_OP_ALTIVEC_BINARY_VLOOP4_WORK_OP_BOOL_MASK_T1(vtype,voperator)); \ 00423 JIT_OP_ALTIVEC_BINARY_VLOOP1_GENERIC(vtype,JIT_OP_ALTIVEC_BINARY_VLOOP1_WORK_OP_BOOL_MASK_T1(vtype,voperator)); \ 00424 JIT_OP_ALTIVEC_BINARY_SLOOP(vtype,soperator); 00425 00426 00427 // top level macros 00428 #define JIT_OP_BINARY_BIG_SIMD(vtype,soperator,vfunction) \ 00429 JIT_OP_BINARY_BIG_SIMD_BEGIN(vtype,soperator) \ 00430 vfunction (n, vecdata, ip0, ip1, op); \ 00431 JIT_OP_BINARY_BIG_SIMD_END(vtype,soperator) 00432 00433 #define JIT_OP_UNARY_BIG_SIMD(vtype,soperator,vfunction) \ 00434 JIT_OP_UNARY_BIG_SIMD_BEGIN(vtype,soperator) \ 00435 vfunction (n, vecdata, ip0, op); \ 00436 JIT_OP_UNARY_BIG_SIMD_END(vtype,soperator) 00437 00438 #if (JIT_OP_CHAR_USE_SIMD) 00439 #define JIT_OP_BINARY_BIG_SIMD_CHAR(soperator,vfunction) \ 00440 JIT_OP_BINARY_BIG_SIMD(unsigned char,soperator,vfunction) 00441 #define JIT_OP_UNARY_BIG_SIMD_CHAR(soperator,vfunction) \ 00442 JIT_OP_UNARY_BIG_SIMD(unsigned char,soperator,vfunction) 00443 #else 00444 #define JIT_OP_BINARY_BIG_SIMD_CHAR(soperator,vfunction) \ 00445 JIT_OP_BINARY_BIG_NO_SIMD(unsigned char,soperator) 00446 #define JIT_OP_UNARY_BIG_SIMD_CHAR(soperator,vfunction) \ 00447 JIT_OP_UNARY_BIG_NO_SIMD(unsigned char,soperator) 00448 #endif //JIT_OP_CHAR_USE_SIMD 00449 00450 #if (JIT_OP_LONG_USE_SIMD) 00451 #define JIT_OP_BINARY_BIG_SIMD_LONG(soperator,vfunction) \ 00452 JIT_OP_BINARY_BIG_SIMD(signed int,soperator,vfunction) 00453 #define JIT_OP_UNARY_BIG_SIMD_LONG(soperator,vfunction) \ 00454 JIT_OP_UNARY_BIG_SIMD(signed int,soperator,vfunction) 00455 #define JIT_OP_BINARY_BIG_SIMD_ULONG(soperator,vfunction) \ 00456 JIT_OP_BINARY_BIG_SIMD(unsigned int,soperator,vfunction) 00457 #define JIT_OP_UNARY_BIG_SIMD_ULONG(soperator,vfunction) \ 00458 JIT_OP_UNARY_BIG_SIMD(unsigned int,soperator,vfunction) 00459 #else 00460 #define JIT_OP_BINARY_BIG_SIMD_LONG(soperator,vfunction) \ 00461 JIT_OP_BINARY_BIG_NO_SIMD(signed int,soperator) 00462 #define JIT_OP_UNARY_BIG_SIMD_LONG(soperator,vfunction) \ 00463 JIT_OP_UNARY_BIG_NO_SIMD(signed int,soperator) 00464 #define JIT_OP_BINARY_BIG_SIMD_ULONG(soperator,vfunction) \ 00465 JIT_OP_BINARY_BIG_NO_SIMD(unsigned int,soperator) 00466 #define JIT_OP_UNARY_BIG_SIMD_ULONG(soperator,vfunction) \ 00467 JIT_OP_UNARY_BIG_NO_SIMD(unsigned int,soperator) 00468 #endif //JIT_OP_LONG_USE_SIMD 00469 00470 #if (JIT_OP_FLOAT32_USE_SIMD) 00471 #define JIT_OP_BINARY_BIG_SIMD_FLOAT32(soperator,vfunction) \ 00472 JIT_OP_BINARY_BIG_SIMD(float,soperator,vfunction) 00473 #define JIT_OP_UNARY_BIG_SIMD_FLOAT32(soperator,vfunction) \ 00474 JIT_OP_UNARY_BIG_SIMD(float,soperator,vfunction) 00475 #else 00476 #define JIT_OP_BINARY_BIG_SIMD_FLOAT32(soperator,vfunction) \ 00477 JIT_OP_BINARY_BIG_NO_SIMD(float,soperator) 00478 #define JIT_OP_UNARY_BIG_SIMD_FLOAT32(soperator,vfunction) \ 00479 JIT_OP_UNARY_BIG_NO_SIMD(float,soperator) 00480 #endif //JIT_OP_FLOAT32_USE_SIMD 00481 00482 00483 // ------------------------------------------------------------ 00484 // Above macros abstracted from code like the following 00485 /* 00486 00487 #define OpAbsFloat32(x) ((x)<0?-(x):(x)) 00488 00489 void jit_op_vector_abs_float32_altivec(long n, void *vecdata, float *ip0, float *op) 00490 { 00491 vector float *v_ip0=(vector float *)ip0; 00492 vector float *v_op=(vector float *)op; 00493 long v_steps,v_steps2,i; 00494 vector float a0; 00495 vector float a1; 00496 vector float a2; 00497 vector float a3; 00498 vector float d0; 00499 vector float d1; 00500 vector float d2; 00501 vector float d3; 00502 UInt32 prefetchSize = AltivecGetPrefetchConstant(16, 1, 256); 00503 00504 v_steps = (n/4); 00505 v_steps2 = v_steps/4; 00506 n = (n%4); 00507 //for later 00508 ip0 += v_steps*4; 00509 op += v_steps*4; 00510 v_steps %= 4; 00511 00512 for (i=0;i<v_steps2;i++) { 00513 vec_dstt( v_ip0, prefetchSize, 0 ); 00514 00515 a0 = vec_ldl(0, v_ip0 ); 00516 a1 = vec_ldl(1 * sizeof(vector float), v_ip0); 00517 a2 = vec_ldl(2 * sizeof(vector float), v_ip0); 00518 a3 = vec_ldl(3 * sizeof(vector float), v_ip0); 00519 00520 d0 = vec_abs(a0); 00521 d1 = vec_abs(a1); 00522 d2 = vec_abs(a2); 00523 d3 = vec_abs(a3); 00524 00525 vec_st( d0, 0, v_op ); 00526 vec_st( d1, 1 * sizeof(vector float), v_op ); 00527 vec_st( d2, 2 * sizeof(vector float), v_op ); 00528 vec_st( d3, 3 * sizeof(vector float), v_op ); 00529 v_ip0 += 4; 00530 v_op += 4; 00531 } 00532 for (i=0;i<v_steps;i++) { 00533 v_op[i] = vec_abs(v_ip0[i]); 00534 } 00535 while (n--) { 00536 *op++ = OpAbsFloat32(*ip0); ip0++; 00537 } 00538 } 00539 00540 with macros would be: 00541 void jit_op_vector_abs_float32_altivec(long n, void *vecdata, float *ip0, float *op) 00542 { 00543 JIT_OP_ALTIVEC_UNARY_BIG(float,vec_abs,OpAbsFloat32); 00544 } 00545 00546 00547 #define OpMaxFloat32(x,y) (((x)>(y))?(x):(y)) 00548 00549 void jit_op_vector_max_float32_altivec(long n, void *vecdata, float *ip0, float *ip1, float *op) 00550 { 00551 vector float *v_ip0=(vector float *)ip0; 00552 vector float *v_ip1=(vector float *)ip1; 00553 vector float *v_op=(vector float *)op; 00554 long v_steps,v_steps2,i; 00555 vector float a0; 00556 vector float a1; 00557 vector float a2; 00558 vector float a3; 00559 vector float b0; 00560 vector float b1; 00561 vector float b2; 00562 vector float b3; 00563 vector float d0; 00564 vector float d1; 00565 vector float d2; 00566 vector float d3; 00567 float sin0,sin1; 00568 UInt32 prefetchSize = AltivecGetPrefetchConstant(16, 1, 256); 00569 00570 v_steps = (n/4); 00571 v_steps2 = v_steps/4; 00572 n = (n%4); 00573 //for later 00574 ip0 += v_steps*4; 00575 ip1 += v_steps*4; 00576 op += v_steps*4; 00577 v_steps %= 4; 00578 00579 for (i=0;i<v_steps2;i++) { 00580 vec_dstt( v_ip0, prefetchSize, 0 ); 00581 vec_dstt( v_ip1, prefetchSize, 1 ); 00582 00583 a0 = vec_ldl(0, v_ip0 ); 00584 a1 = vec_ldl(1 * sizeof(vector float), v_ip0); 00585 a2 = vec_ldl(2 * sizeof(vector float), v_ip0); 00586 a3 = vec_ldl(3 * sizeof(vector float), v_ip0); 00587 00588 b0 = vec_ldl(0, v_ip1 ); 00589 b1 = vec_ldl(1 * sizeof(vector float), v_ip1); 00590 b2 = vec_ldl(2 * sizeof(vector float), v_ip1); 00591 b3 = vec_ldl(3 * sizeof(vector float), v_ip1); 00592 00593 d0 = vec_max(a0,b0); 00594 d1 = vec_max(a1,b1); 00595 d2 = vec_max(a2,b2); 00596 d3 = vec_max(a3,b3); 00597 00598 vec_st( d0, 0, v_op ); 00599 vec_st( d1, 1 * sizeof(vector float), v_op ); 00600 vec_st( d2, 2 * sizeof(vector float), v_op ); 00601 vec_st( d3, 3 * sizeof(vector float), v_op ); 00602 v_ip0 += 4; 00603 v_ip1 += 4; 00604 v_op += 4; 00605 } 00606 for (i=0;i<v_steps;i++) { 00607 v_op[i] = vec_max(v_ip0[i],v_ip1[i]); 00608 } 00609 while (n--) { 00610 sin0 = *ip0++; 00611 sin1 = *ip1++; 00612 *op++ = OpMaxFloat32(sin0,sin1); 00613 } 00614 } 00615 00616 with macros would be: 00617 void jit_op_vector_max_float32_altivec(long n, void *vecdata, float *ip0, float *ip1, float *op) 00618 { 00619 JIT_OP_ALTIVEC_BINARY_BIG(float,vec_max,OpMaxFloat32); 00620 } 00621 00622 */ 00623 00624 #endif //_JIT_OP_SIMD_H_
Copyright © 2008, Cycling '74