Botan
1.11.15
|
00001 /* 00002 * Lowest Level MPI Algorithms 00003 * (C) 1999-2010 Jack Lloyd 00004 * 2006 Luca Piccarreta 00005 * 00006 * Botan is released under the Simplified BSD License (see license.txt) 00007 */ 00008 00009 #ifndef BOTAN_MP_ASM_INTERNAL_H__ 00010 #define BOTAN_MP_ASM_INTERNAL_H__ 00011 00012 #include <botan/internal/mp_madd.h> 00013 00014 namespace Botan { 00015 00016 extern "C" { 00017 00018 /* 00019 * Word Addition 00020 */ 00021 inline word word_add(word x, word y, word* carry) 00022 { 00023 word z = x + y; 00024 word c1 = (z < x); 00025 z += *carry; 00026 *carry = c1 | (z < *carry); 00027 return z; 00028 } 00029 00030 /* 00031 * Eight Word Block Addition, Two Argument 00032 */ 00033 inline word word8_add2(word x[8], const word y[8], word carry) 00034 { 00035 __asm { 00036 mov edx,[x] 00037 mov esi,[y] 00038 xor eax,eax 00039 sub eax,[carry] //force CF=1 iff *carry==1 00040 mov eax,[esi] 00041 adc [edx],eax 00042 mov eax,[esi+4] 00043 adc [edx+4],eax 00044 mov eax,[esi+8] 00045 adc [edx+8],eax 00046 mov eax,[esi+12] 00047 adc [edx+12],eax 00048 mov eax,[esi+16] 00049 adc [edx+16],eax 00050 mov eax,[esi+20] 00051 adc [edx+20],eax 00052 mov eax,[esi+24] 00053 adc [edx+24],eax 00054 mov eax,[esi+28] 00055 adc [edx+28],eax 00056 sbb eax,eax 00057 neg eax 00058 } 00059 } 00060 00061 /* 00062 * Eight Word Block Addition, Three Argument 00063 */ 00064 inline word word8_add3(word z[8], const word x[8], const word y[8], word carry) 00065 { 00066 __asm { 00067 mov edi,[x] 00068 mov esi,[y] 00069 mov ebx,[z] 00070 xor eax,eax 00071 sub eax,[carry] //force CF=1 iff *carry==1 00072 mov eax,[edi] 00073 adc eax,[esi] 00074 mov [ebx],eax 00075 00076 mov eax,[edi+4] 00077 adc eax,[esi+4] 00078 mov [ebx+4],eax 00079 00080 mov eax,[edi+8] 00081 adc eax,[esi+8] 00082 mov [ebx+8],eax 00083 00084 mov eax,[edi+12] 00085 adc eax,[esi+12] 00086 mov [ebx+12],eax 00087 00088 mov eax,[edi+16] 00089 adc eax,[esi+16] 00090 mov [ebx+16],eax 00091 00092 mov eax,[edi+20] 00093 adc eax,[esi+20] 00094 mov [ebx+20],eax 00095 00096 mov eax,[edi+24] 00097 adc eax,[esi+24] 00098 mov [ebx+24],eax 00099 00100 mov eax,[edi+28] 00101 adc eax,[esi+28] 00102 mov [ebx+28],eax 00103 00104 sbb eax,eax 00105 neg eax 00106 } 00107 } 00108 00109 /* 00110 * Word Subtraction 00111 */ 00112 inline word word_sub(word x, word y, word* carry) 00113 { 00114 word t0 = x - y; 00115 word c1 = (t0 > x); 00116 word z = t0 - *carry; 00117 *carry = c1 | (z > t0); 00118 return z; 00119 } 00120 00121 /* 00122 * Eight Word Block Subtraction, Two Argument 00123 */ 00124 inline word word8_sub2(word x[8], const word y[8], word carry) 00125 { 00126 __asm { 00127 mov edi,[x] 00128 mov esi,[y] 00129 xor eax,eax 00130 sub eax,[carry] //force CF=1 iff *carry==1 00131 mov eax,[edi] 00132 sbb eax,[esi] 00133 mov [edi],eax 00134 mov eax,[edi+4] 00135 sbb eax,[esi+4] 00136 mov [edi+4],eax 00137 mov eax,[edi+8] 00138 sbb eax,[esi+8] 00139 mov [edi+8],eax 00140 mov eax,[edi+12] 00141 sbb eax,[esi+12] 00142 mov [edi+12],eax 00143 mov eax,[edi+16] 00144 sbb eax,[esi+16] 00145 mov [edi+16],eax 00146 mov eax,[edi+20] 00147 sbb eax,[esi+20] 00148 mov [edi+20],eax 00149 mov eax,[edi+24] 00150 sbb eax,[esi+24] 00151 mov [edi+24],eax 00152 mov eax,[edi+28] 00153 sbb eax,[esi+28] 00154 mov [edi+28],eax 00155 sbb eax,eax 00156 neg eax 00157 } 00158 } 00159 00160 /* 00161 * Eight Word Block Subtraction, Two Argument 00162 */ 00163 inline word word8_sub2_rev(word x[8], const word y[8], word carry) 00164 { 00165 x[0] = word_sub(y[0], x[0], &carry); 00166 x[1] = word_sub(y[1], x[1], &carry); 00167 x[2] = word_sub(y[2], x[2], &carry); 00168 x[3] = word_sub(y[3], x[3], &carry); 00169 x[4] = word_sub(y[4], x[4], &carry); 00170 x[5] = word_sub(y[5], x[5], &carry); 00171 x[6] = word_sub(y[6], x[6], &carry); 00172 x[7] = word_sub(y[7], x[7], &carry); 00173 return carry; 00174 } 00175 00176 00177 /* 00178 * Eight Word Block Subtraction, Three Argument 00179 */ 00180 inline word word8_sub3(word z[8], const word x[8], 00181 const word y[8], word carry) 00182 { 00183 __asm { 00184 mov edi,[x] 00185 mov esi,[y] 00186 xor eax,eax 00187 sub eax,[carry] //force CF=1 iff *carry==1 00188 mov ebx,[z] 00189 mov eax,[edi] 00190 sbb eax,[esi] 00191 mov [ebx],eax 00192 mov eax,[edi+4] 00193 sbb eax,[esi+4] 00194 mov [ebx+4],eax 00195 mov eax,[edi+8] 00196 sbb eax,[esi+8] 00197 mov [ebx+8],eax 00198 mov eax,[edi+12] 00199 sbb eax,[esi+12] 00200 mov [ebx+12],eax 00201 mov eax,[edi+16] 00202 sbb eax,[esi+16] 00203 mov [ebx+16],eax 00204 mov eax,[edi+20] 00205 sbb eax,[esi+20] 00206 mov [ebx+20],eax 00207 mov eax,[edi+24] 00208 sbb eax,[esi+24] 00209 mov [ebx+24],eax 00210 mov eax,[edi+28] 00211 sbb eax,[esi+28] 00212 mov [ebx+28],eax 00213 sbb eax,eax 00214 neg eax 00215 } 00216 } 00217 00218 /* 00219 * Eight Word Block Linear Multiplication 00220 */ 00221 inline word word8_linmul2(word x[8], word y, word carry) 00222 { 00223 __asm { 00224 mov esi,[x] 00225 mov eax,[esi] //load a 00226 mul [y] //edx(hi):eax(lo)=a*b 00227 add eax,[carry] //sum lo carry 00228 adc edx,0 //sum hi carry 00229 mov ecx,edx //store carry 00230 mov [esi],eax //load a 00231 00232 mov eax,[esi+4] //load a 00233 mul [y] //edx(hi):eax(lo)=a*b 00234 add eax,ecx //sum lo carry 00235 adc edx,0 //sum hi carry 00236 mov ecx,edx //store carry 00237 mov [esi+4],eax //load a 00238 00239 mov eax,[esi+8] //load a 00240 mul [y] //edx(hi):eax(lo)=a*b 00241 add eax,ecx //sum lo carry 00242 adc edx,0 //sum hi carry 00243 mov ecx,edx //store carry 00244 mov [esi+8],eax //load a 00245 00246 mov eax,[esi+12] //load a 00247 mul [y] //edx(hi):eax(lo)=a*b 00248 add eax,ecx //sum lo carry 00249 adc edx,0 //sum hi carry 00250 mov ecx,edx //store carry 00251 mov [esi+12],eax //load a 00252 00253 mov eax,[esi+16] //load a 00254 mul [y] //edx(hi):eax(lo)=a*b 00255 add eax,ecx //sum lo carry 00256 adc edx,0 //sum hi carry 00257 mov ecx,edx //store carry 00258 mov [esi+16],eax //load a 00259 00260 mov eax,[esi+20] //load a 00261 mul [y] //edx(hi):eax(lo)=a*b 00262 add eax,ecx //sum lo carry 00263 adc edx,0 //sum hi carry 00264 mov ecx,edx //store carry 00265 mov [esi+20],eax //load a 00266 00267 mov eax,[esi+24] //load a 00268 mul [y] //edx(hi):eax(lo)=a*b 00269 add eax,ecx //sum lo carry 00270 adc edx,0 //sum hi carry 00271 mov ecx,edx //store carry 00272 mov [esi+24],eax //load a 00273 00274 mov eax,[esi+28] //load a 00275 mul [y] //edx(hi):eax(lo)=a*b 00276 add eax,ecx //sum lo carry 00277 adc edx,0 //sum hi carry 00278 mov [esi+28],eax //load a 00279 00280 mov eax,edx //store carry 00281 } 00282 } 00283 00284 /* 00285 * Eight Word Block Linear Multiplication 00286 */ 00287 inline word word8_muladd(word z[8], const word x[8], 00288 word y, word carry) 00289 { 00290 __asm { 00291 mov esi,[x] 00292 mov ebx,[y] 00293 mov edi,[z] 00294 mov eax,[esi] //load a 00295 mul ebx //edx(hi):eax(lo)=a*b 00296 add eax,[carry] //sum lo carry 00297 adc edx,0 //sum hi carry 00298 add eax,[edi] //sum lo z 00299 adc edx,0 //sum hi z 00300 mov ecx,edx //carry for next block = hi z 00301 mov [edi],eax //save lo z 00302 00303 mov eax,[esi+4] 00304 mul ebx 00305 add eax,ecx 00306 adc edx,0 00307 add eax,[edi+4] 00308 adc edx,0 00309 mov ecx,edx 00310 mov [edi+4],eax 00311 00312 mov eax,[esi+8] 00313 mul ebx 00314 add eax,ecx 00315 adc edx,0 00316 add eax,[edi+8] 00317 adc edx,0 00318 mov ecx,edx 00319 mov [edi+8],eax 00320 00321 mov eax,[esi+12] 00322 mul ebx 00323 add eax,ecx 00324 adc edx,0 00325 add eax,[edi+12] 00326 adc edx,0 00327 mov ecx,edx 00328 mov [edi+12],eax 00329 00330 mov eax,[esi+16] 00331 mul ebx 00332 add eax,ecx 00333 adc edx,0 00334 add eax,[edi+16] 00335 adc edx,0 00336 mov ecx,edx 00337 mov [edi+16],eax 00338 00339 mov eax,[esi+20] 00340 mul ebx 00341 add eax,ecx 00342 adc edx,0 00343 add eax,[edi+20] 00344 adc edx,0 00345 mov ecx,edx 00346 mov [edi+20],eax 00347 00348 mov eax,[esi+24] 00349 mul ebx 00350 add eax,ecx 00351 adc edx,0 00352 add eax,[edi+24] 00353 adc edx,0 00354 mov ecx,edx 00355 mov [edi+24],eax 00356 00357 mov eax,[esi+28] 00358 mul ebx 00359 add eax,ecx 00360 adc edx,0 00361 add eax,[edi+28] 00362 adc edx,0 00363 mov [edi+28],eax 00364 mov eax,edx 00365 } 00366 } 00367 00368 inline word word8_linmul3(word z[4], const word x[4], word y, word carry) 00369 { 00370 __asm { 00371 #if 0 00372 //it's slower!!! 00373 mov edx,[z] 00374 mov eax,[x] 00375 movd mm7,[y] 00376 00377 movd mm0,[eax] 00378 movd mm1,[eax+4] 00379 movd mm2,[eax+8] 00380 pmuludq mm0,mm7 00381 pmuludq mm1,mm7 00382 pmuludq mm2,mm7 00383 00384 movd mm6,[carry] 00385 paddq mm0,mm6 00386 movd [edx],mm0 00387 00388 psrlq mm0,32 00389 paddq mm1,mm0 00390 movd [edx+4],mm1 00391 00392 movd mm3,[eax+12] 00393 psrlq mm1,32 00394 paddq mm2,mm1 00395 movd [edx+8],mm2 00396 00397 pmuludq mm3,mm7 00398 movd mm4,[eax+16] 00399 psrlq mm2,32 00400 paddq mm3,mm2 00401 movd [edx+12],mm3 00402 00403 pmuludq mm4,mm7 00404 movd mm5,[eax+20] 00405 psrlq mm3,32 00406 paddq mm4,mm3 00407 movd [edx+16],mm4 00408 00409 pmuludq mm5,mm7 00410 movd mm0,[eax+24] 00411 psrlq mm4,32 00412 paddq mm5,mm4 00413 movd [edx+20],mm5 00414 00415 pmuludq mm0,mm7 00416 movd mm1,[eax+28] 00417 psrlq mm5,32 00418 paddq mm0,mm5 00419 movd [edx+24],mm0 00420 00421 pmuludq mm1,mm7 00422 psrlq mm0,32 00423 paddq mm1,mm0 00424 movd [edx+28],mm1 00425 psrlq mm1,32 00426 00427 movd eax,mm1 00428 emms 00429 #else 00430 mov edi,[z] 00431 mov esi,[x] 00432 mov eax,[esi] //load a 00433 mul [y] //edx(hi):eax(lo)=a*b 00434 add eax,[carry] //sum lo carry 00435 adc edx,0 //sum hi carry 00436 mov ecx,edx //store carry 00437 mov [edi],eax //load a 00438 00439 mov eax,[esi+4] //load a 00440 mul [y] //edx(hi):eax(lo)=a*b 00441 add eax,ecx //sum lo carry 00442 adc edx,0 //sum hi carry 00443 mov ecx,edx //store carry 00444 mov [edi+4],eax //load a 00445 00446 mov eax,[esi+8] //load a 00447 mul [y] //edx(hi):eax(lo)=a*b 00448 add eax,ecx //sum lo carry 00449 adc edx,0 //sum hi carry 00450 mov ecx,edx //store carry 00451 mov [edi+8],eax //load a 00452 00453 mov eax,[esi+12] //load a 00454 mul [y] //edx(hi):eax(lo)=a*b 00455 add eax,ecx //sum lo carry 00456 adc edx,0 //sum hi carry 00457 mov ecx,edx //store carry 00458 mov [edi+12],eax //load a 00459 00460 mov eax,[esi+16] //load a 00461 mul [y] //edx(hi):eax(lo)=a*b 00462 add eax,ecx //sum lo carry 00463 adc edx,0 //sum hi carry 00464 mov ecx,edx //store carry 00465 mov [edi+16],eax //load a 00466 00467 mov eax,[esi+20] //load a 00468 mul [y] //edx(hi):eax(lo)=a*b 00469 add eax,ecx //sum lo carry 00470 adc edx,0 //sum hi carry 00471 mov ecx,edx //store carry 00472 mov [edi+20],eax //load a 00473 00474 mov eax,[esi+24] //load a 00475 mul [y] //edx(hi):eax(lo)=a*b 00476 add eax,ecx //sum lo carry 00477 adc edx,0 //sum hi carry 00478 mov ecx,edx //store carry 00479 mov [edi+24],eax //load a 00480 00481 mov eax,[esi+28] //load a 00482 mul [y] //edx(hi):eax(lo)=a*b 00483 add eax,ecx //sum lo carry 00484 adc edx,0 //sum hi carry 00485 mov [edi+28],eax //load a 00486 mov eax,edx //store carry 00487 #endif 00488 } 00489 } 00490 00491 /* 00492 * Eight Word Block Multiply/Add 00493 */ 00494 inline word word8_madd3(word z[8], const word x[8], word y, word carry) 00495 { 00496 z[0] = word_madd3(x[0], y, z[0], &carry); 00497 z[1] = word_madd3(x[1], y, z[1], &carry); 00498 z[2] = word_madd3(x[2], y, z[2], &carry); 00499 z[3] = word_madd3(x[3], y, z[3], &carry); 00500 z[4] = word_madd3(x[4], y, z[4], &carry); 00501 z[5] = word_madd3(x[5], y, z[5], &carry); 00502 z[6] = word_madd3(x[6], y, z[6], &carry); 00503 z[7] = word_madd3(x[7], y, z[7], &carry); 00504 return carry; 00505 } 00506 00507 /* 00508 * Multiply-Add Accumulator 00509 */ 00510 inline void word3_muladd(word* w2, word* w1, word* w0, word a, word b) 00511 { 00512 word carry = *w0; 00513 *w0 = word_madd2(a, b, &carry); 00514 *w1 += carry; 00515 *w2 += (*w1 < carry) ? 1 : 0; 00516 } 00517 00518 /* 00519 * Multiply-Add Accumulator 00520 */ 00521 inline void word3_muladd_2(word* w2, word* w1, word* w0, word a, word b) 00522 { 00523 word carry = 0; 00524 a = word_madd2(a, b, &carry); 00525 b = carry; 00526 00527 word top = (b >> (BOTAN_MP_WORD_BITS-1)); 00528 b <<= 1; 00529 b |= (a >> (BOTAN_MP_WORD_BITS-1)); 00530 a <<= 1; 00531 00532 carry = 0; 00533 *w0 = word_add(*w0, a, &carry); 00534 *w1 = word_add(*w1, b, &carry); 00535 *w2 = word_add(*w2, top, &carry); 00536 } 00537 00538 } 00539 00540 } 00541 00542 #endif