Botan  1.11.15
src/lib/math/mp/mp_x86_32_msvc/mp_asmi.h
Go to the documentation of this file.
00001 /*
00002 * Lowest Level MPI Algorithms
00003 * (C) 1999-2010 Jack Lloyd
00004 *     2006 Luca Piccarreta
00005 *
00006 * Botan is released under the Simplified BSD License (see license.txt)
00007 */
00008 
00009 #ifndef BOTAN_MP_ASM_INTERNAL_H__
00010 #define BOTAN_MP_ASM_INTERNAL_H__
00011 
00012 #include <botan/internal/mp_madd.h>
00013 
00014 namespace Botan {
00015 
00016 extern "C" {
00017 
00018 /*
00019 * Word Addition
00020 */
00021 inline word word_add(word x, word y, word* carry)
00022    {
00023    word z = x + y;
00024    word c1 = (z < x);
00025    z += *carry;
00026    *carry = c1 | (z < *carry);
00027    return z;
00028    }
00029 
00030 /*
00031 * Eight Word Block Addition, Two Argument
      *
      * Adds the eight words of y into x in place (x[i] += y[i]), rippling the
      * carry from word 0 up to word 7 through the CPU carry flag. The incoming
      * carry is injected by priming CF before the first adc. The outgoing
      * carry (0 or 1) is left in EAX; there is no return statement, so this
      * presumably relies on MSVC using EAX as the result of a function that
      * ends inside an __asm block.
00032 */
00033 inline word word8_add2(word x[8], const word y[8], word carry)
00034    {
00035    __asm {
00036       mov edx,[x]        //edx = x (read and written)
00037       mov esi,[y]        //esi = y (read only)
00038       xor eax,eax
00039       sub eax,[carry]    //0 - carry: sets CF=1 iff carry != 0
      //mov does not touch the flags, so CF survives from one adc to the next
00040       mov eax,[esi]
00041       adc [edx],eax      //x[0] += y[0] + CF
00042       mov eax,[esi+4]
00043       adc [edx+4],eax    //x[1] += y[1] + CF
00044       mov eax,[esi+8]
00045       adc [edx+8],eax    //x[2] += y[2] + CF
00046       mov eax,[esi+12]
00047       adc [edx+12],eax   //x[3] += y[3] + CF
00048       mov eax,[esi+16]
00049       adc [edx+16],eax   //x[4] += y[4] + CF
00050       mov eax,[esi+20]
00051       adc [edx+20],eax   //x[5] += y[5] + CF
00052       mov eax,[esi+24]
00053       adc [edx+24],eax   //x[6] += y[6] + CF
00054       mov eax,[esi+28]
00055       adc [edx+28],eax   //x[7] += y[7] + CF
00056       sbb eax,eax        //eax = 0 - CF, i.e. 0 or 0xFFFFFFFF
00057       neg eax            //eax = final carry, 0 or 1
00058       }
00059    }
00060 
00061 /*
00062 * Eight Word Block Addition, Three Argument
      *
      * Computes z[i] = x[i] + y[i] + carry for i = 0..7, rippling the carry
      * from word 0 up to word 7 through the CPU carry flag. The incoming carry
      * is injected by priming CF before the first adc. The outgoing carry
      * (0 or 1) is left in EAX; there is no return statement, so this
      * presumably relies on MSVC using EAX as the result of a function that
      * ends inside an __asm block.
00063 */
00064 inline word word8_add3(word z[8], const word x[8], const word y[8], word carry)
00065    {
00066     __asm {
00067       mov edi,[x]        //edi = x (read only)
00068       mov esi,[y]        //esi = y (read only)
00069       mov ebx,[z]        //ebx = z (written)
00070       xor eax,eax
00071       sub eax,[carry]    //0 - carry: sets CF=1 iff carry != 0
      //mov does not touch the flags, so CF survives from one adc to the next
00072       mov eax,[edi]
00073       adc eax,[esi]      //eax = x[0] + y[0] + CF
00074       mov [ebx],eax      //z[0] = eax
00075 
00076       mov eax,[edi+4]
00077       adc eax,[esi+4]
00078       mov [ebx+4],eax    //z[1]
00079 
00080       mov eax,[edi+8]
00081       adc eax,[esi+8]
00082       mov [ebx+8],eax    //z[2]
00083 
00084       mov eax,[edi+12]
00085       adc eax,[esi+12]
00086       mov [ebx+12],eax   //z[3]
00087 
00088       mov eax,[edi+16]
00089       adc eax,[esi+16]
00090       mov [ebx+16],eax   //z[4]
00091 
00092       mov eax,[edi+20]
00093       adc eax,[esi+20]
00094       mov [ebx+20],eax   //z[5]
00095 
00096       mov eax,[edi+24]
00097       adc eax,[esi+24]
00098       mov [ebx+24],eax   //z[6]
00099 
00100       mov eax,[edi+28]
00101       adc eax,[esi+28]
00102       mov [ebx+28],eax   //z[7]
00103 
00104       sbb eax,eax        //eax = 0 - CF, i.e. 0 or 0xFFFFFFFF
00105       neg eax            //eax = final carry, 0 or 1
00106       }
00107    }
00108 
00109 /*
00110 * Word Subtraction
00111 */
00112 inline word word_sub(word x, word y, word* carry)
00113    {
00114    word t0 = x - y;
00115    word c1 = (t0 > x);
00116    word z = t0 - *carry;
00117    *carry = c1 | (z > t0);
00118    return z;
00119    }
00120 
00121 /*
00122 * Eight Word Block Subtraction, Two Argument
      *
      * Subtracts the eight words of y from x in place (x[i] -= y[i]),
      * rippling the borrow from word 0 up to word 7 through the CPU carry
      * flag. The incoming borrow is injected by priming CF before the first
      * sbb. The outgoing borrow (0 or 1) is left in EAX; there is no return
      * statement, so this presumably relies on MSVC using EAX as the result
      * of a function that ends inside an __asm block.
00123 */
00124 inline word word8_sub2(word x[8], const word y[8], word carry)
00125    {
00126     __asm {
00127       mov edi,[x]        //edi = x (read and written)
00128       mov esi,[y]        //esi = y (read only)
00129       xor eax,eax
00130       sub eax,[carry]    //0 - carry: sets CF=1 iff carry != 0
      //mov does not touch the flags, so CF survives from one sbb to the next
00131       mov eax,[edi]
00132       sbb eax,[esi]      //eax = x[0] - y[0] - CF
00133       mov [edi],eax      //x[0] = eax
00134       mov eax,[edi+4]
00135       sbb eax,[esi+4]
00136       mov [edi+4],eax    //x[1]
00137       mov eax,[edi+8]
00138       sbb eax,[esi+8]
00139       mov [edi+8],eax    //x[2]
00140       mov eax,[edi+12]
00141       sbb eax,[esi+12]
00142       mov [edi+12],eax   //x[3]
00143       mov eax,[edi+16]
00144       sbb eax,[esi+16]
00145       mov [edi+16],eax   //x[4]
00146       mov eax,[edi+20]
00147       sbb eax,[esi+20]
00148       mov [edi+20],eax   //x[5]
00149       mov eax,[edi+24]
00150       sbb eax,[esi+24]
00151       mov [edi+24],eax   //x[6]
00152       mov eax,[edi+28]
00153       sbb eax,[esi+28]
00154       mov [edi+28],eax   //x[7]
00155       sbb eax,eax        //eax = 0 - CF, i.e. 0 or 0xFFFFFFFF
00156       neg eax            //eax = final borrow, 0 or 1
00157       }
00158    }
00159 
00160 /*
00161 * Eight Word Block Subtraction, Two Argument
00162 */
00163 inline word word8_sub2_rev(word x[8], const word y[8], word carry)
00164    {
00165    x[0] = word_sub(y[0], x[0], &carry);
00166    x[1] = word_sub(y[1], x[1], &carry);
00167    x[2] = word_sub(y[2], x[2], &carry);
00168    x[3] = word_sub(y[3], x[3], &carry);
00169    x[4] = word_sub(y[4], x[4], &carry);
00170    x[5] = word_sub(y[5], x[5], &carry);
00171    x[6] = word_sub(y[6], x[6], &carry);
00172    x[7] = word_sub(y[7], x[7], &carry);
00173    return carry;
00174    }
00175 
00176 
00177 /*
00178 * Eight Word Block Subtraction, Three Argument
      *
      * Computes z[i] = x[i] - y[i] - carry for i = 0..7, rippling the borrow
      * from word 0 up to word 7 through the CPU carry flag. The incoming
      * borrow is injected by priming CF before the first sbb. The outgoing
      * borrow (0 or 1) is left in EAX; there is no return statement, so this
      * presumably relies on MSVC using EAX as the result of a function that
      * ends inside an __asm block.
00179 */
00180 inline word word8_sub3(word z[8], const word x[8],
00181                        const word y[8], word carry)
00182    {
00183     __asm {
00184       mov edi,[x]        //edi = x (read only)
00185       mov esi,[y]        //esi = y (read only)
00186       xor eax,eax
00187       sub eax,[carry]    //0 - carry: sets CF=1 iff carry != 0
      //mov does not touch the flags, so CF survives the loads and stores below
00188       mov ebx,[z]        //ebx = z (written)
00189       mov eax,[edi]
00190       sbb eax,[esi]      //eax = x[0] - y[0] - CF
00191       mov [ebx],eax      //z[0] = eax
00192       mov eax,[edi+4]
00193       sbb eax,[esi+4]
00194       mov [ebx+4],eax    //z[1]
00195       mov eax,[edi+8]
00196       sbb eax,[esi+8]
00197       mov [ebx+8],eax    //z[2]
00198       mov eax,[edi+12]
00199       sbb eax,[esi+12]
00200       mov [ebx+12],eax   //z[3]
00201       mov eax,[edi+16]
00202       sbb eax,[esi+16]
00203       mov [ebx+16],eax   //z[4]
00204       mov eax,[edi+20]
00205       sbb eax,[esi+20]
00206       mov [ebx+20],eax   //z[5]
00207       mov eax,[edi+24]
00208       sbb eax,[esi+24]
00209       mov [ebx+24],eax   //z[6]
00210       mov eax,[edi+28]
00211       sbb eax,[esi+28]
00212       mov [ebx+28],eax   //z[7]
00213       sbb eax,eax        //eax = 0 - CF, i.e. 0 or 0xFFFFFFFF
00214       neg eax            //eax = final borrow, 0 or 1
00215       }
00216    }
00217 
00218 /*
00219 * Eight Word Block Linear Multiplication
      *
      * Multiplies the eight words of x by the single word y in place. For
      * each i = 0..7, x[i] becomes the low word of x[i] * y + c, where c is
      * the carry argument for word 0 and the high word of the previous step
      * afterwards. The high word of the last step is left in EAX; there is
      * no return statement, so this presumably relies on MSVC using EAX as
      * the result of a function that ends inside an __asm block.
00220 */
00221 inline word word8_linmul2(word x[8], word y, word carry)
00222    {
00223    __asm {
00224       mov esi,[x]          //esi = x (read and written)
00225       mov eax,[esi]        //load x[0]
00226       mul [y]              //edx(hi):eax(lo) = x[0] * y
00227       add eax,[carry]      //lo += incoming carry
00228       adc edx,0            //propagate the overflow of lo into hi
00229       mov ecx,edx          //ecx = carry into the next word
00230       mov [esi],eax        //store lo as x[0]
00231 
00232       mov eax,[esi+4]      //load x[1]
00233       mul [y]              //edx(hi):eax(lo) = x[1] * y
00234       add eax,ecx          //lo += carry from the previous word
00235       adc edx,0            //propagate the overflow of lo into hi
00236       mov ecx,edx          //ecx = carry into the next word
00237       mov [esi+4],eax      //store lo as x[1]
00238 
00239       mov eax,[esi+8]      //load x[2]
00240       mul [y]              //edx(hi):eax(lo) = x[2] * y
00241       add eax,ecx          //lo += carry from the previous word
00242       adc edx,0            //propagate the overflow of lo into hi
00243       mov ecx,edx          //ecx = carry into the next word
00244       mov [esi+8],eax      //store lo as x[2]
00245 
00246       mov eax,[esi+12]     //load x[3]
00247       mul [y]              //edx(hi):eax(lo) = x[3] * y
00248       add eax,ecx          //lo += carry from the previous word
00249       adc edx,0            //propagate the overflow of lo into hi
00250       mov ecx,edx          //ecx = carry into the next word
00251       mov [esi+12],eax     //store lo as x[3]
00252 
00253       mov eax,[esi+16]     //load x[4]
00254       mul [y]              //edx(hi):eax(lo) = x[4] * y
00255       add eax,ecx          //lo += carry from the previous word
00256       adc edx,0            //propagate the overflow of lo into hi
00257       mov ecx,edx          //ecx = carry into the next word
00258       mov [esi+16],eax     //store lo as x[4]
00259 
00260       mov eax,[esi+20]     //load x[5]
00261       mul [y]              //edx(hi):eax(lo) = x[5] * y
00262       add eax,ecx          //lo += carry from the previous word
00263       adc edx,0            //propagate the overflow of lo into hi
00264       mov ecx,edx          //ecx = carry into the next word
00265       mov [esi+20],eax     //store lo as x[5]
00266 
00267       mov eax,[esi+24]     //load x[6]
00268       mul [y]              //edx(hi):eax(lo) = x[6] * y
00269       add eax,ecx          //lo += carry from the previous word
00270       adc edx,0            //propagate the overflow of lo into hi
00271       mov ecx,edx          //ecx = carry into the next word
00272       mov [esi+24],eax     //store lo as x[6]
00273 
00274       mov eax,[esi+28]     //load x[7]
00275       mul [y]              //edx(hi):eax(lo) = x[7] * y
00276       add eax,ecx          //lo += carry from the previous word
00277       adc edx,0            //propagate the overflow of lo into hi
00278       mov [esi+28],eax     //store lo as x[7]
00279 
00280       mov eax,edx          //eax = final carry word (the result)
00281       }
00282    }
00283 
00284 /*
00285 * Eight Word Block Multiply-Add
      *
      * Accumulates x * y into z over eight words. For each i = 0..7, z[i]
      * becomes the low word of x[i] * y + c + z[i], where c is the carry
      * argument for word 0 and the high word of the previous step afterwards.
      * The high word of the last step is left in EAX; there is no return
      * statement, so this presumably relies on MSVC using EAX as the result
      * of a function that ends inside an __asm block.
00286 */
00287 inline word word8_muladd(word z[8], const word x[8],
00288                          word y, word carry)
00289    {
00290    __asm {
00291       mov esi,[x]       //esi = x (read only)
00292       mov ebx,[y]       //ebx = y, the multiplier for every word
00293       mov edi,[z]       //edi = z (read and written)
00294       mov eax,[esi]     //load x[0]
00295       mul ebx           //edx(hi):eax(lo) = x[0] * y
00296       add eax,[carry]   //lo += incoming carry
00297       adc edx,0         //propagate the overflow of lo into hi
00298       add eax,[edi]     //lo += z[0]
00299       adc edx,0         //propagate the overflow of lo into hi
00300       mov ecx,edx       //carry for the next word = hi
00301       mov [edi],eax     //z[0] = lo
00302 
      //words 1..6 repeat the same step with ecx as the incoming carry
00303       mov eax,[esi+4]
00304       mul ebx
00305       add eax,ecx
00306       adc edx,0
00307       add eax,[edi+4]
00308       adc edx,0
00309       mov ecx,edx
00310       mov [edi+4],eax   //z[1]
00311 
00312       mov eax,[esi+8]
00313       mul ebx
00314       add eax,ecx
00315       adc edx,0
00316       add eax,[edi+8]
00317       adc edx,0
00318       mov ecx,edx
00319       mov [edi+8],eax   //z[2]
00320 
00321       mov eax,[esi+12]
00322       mul ebx
00323       add eax,ecx
00324       adc edx,0
00325       add eax,[edi+12]
00326       adc edx,0
00327       mov ecx,edx
00328       mov [edi+12],eax  //z[3]
00329 
00330       mov eax,[esi+16]
00331       mul ebx
00332       add eax,ecx
00333       adc edx,0
00334       add eax,[edi+16]
00335       adc edx,0
00336       mov ecx,edx
00337       mov [edi+16],eax  //z[4]
00338 
00339       mov eax,[esi+20]
00340       mul ebx
00341       add eax,ecx
00342       adc edx,0
00343       add eax,[edi+20]
00344       adc edx,0
00345       mov ecx,edx
00346       mov [edi+20],eax  //z[5]
00347 
00348       mov eax,[esi+24]
00349       mul ebx
00350       add eax,ecx
00351       adc edx,0
00352       add eax,[edi+24]
00353       adc edx,0
00354       mov ecx,edx
00355       mov [edi+24],eax  //z[6]
00356 
      //last word: the high word goes straight to eax instead of ecx
00357       mov eax,[esi+28]
00358       mul ebx
00359       add eax,ecx
00360       adc edx,0
00361       add eax,[edi+28]
00362       adc edx,0
00363       mov [edi+28],eax  //z[7]
00364       mov eax,edx       //eax = final carry word (the result)
00365       }
00366    }
00367 
00368 inline word word8_linmul3(word z[4], const word x[4], word y, word carry)
00369    {
00370    __asm {
00371 #if 0
00372       //it's slower!!!
00373       mov edx,[z]
00374       mov eax,[x]
00375       movd mm7,[y]
00376 
00377       movd mm0,[eax]
00378       movd mm1,[eax+4]
00379       movd mm2,[eax+8]
00380       pmuludq mm0,mm7
00381       pmuludq mm1,mm7
00382       pmuludq mm2,mm7
00383 
00384       movd mm6,[carry]
00385       paddq mm0,mm6
00386       movd [edx],mm0
00387 
00388       psrlq mm0,32
00389       paddq mm1,mm0
00390       movd [edx+4],mm1
00391 
00392       movd mm3,[eax+12]
00393       psrlq mm1,32
00394       paddq mm2,mm1
00395       movd [edx+8],mm2
00396 
00397       pmuludq mm3,mm7
00398       movd mm4,[eax+16]
00399       psrlq mm2,32
00400       paddq mm3,mm2
00401       movd [edx+12],mm3
00402 
00403       pmuludq mm4,mm7
00404       movd mm5,[eax+20]
00405       psrlq mm3,32
00406       paddq mm4,mm3
00407       movd [edx+16],mm4
00408 
00409       pmuludq mm5,mm7
00410       movd mm0,[eax+24]
00411       psrlq mm4,32
00412       paddq mm5,mm4
00413       movd [edx+20],mm5
00414 
00415       pmuludq mm0,mm7
00416       movd mm1,[eax+28]
00417       psrlq mm5,32
00418       paddq mm0,mm5
00419       movd [edx+24],mm0
00420 
00421       pmuludq mm1,mm7
00422       psrlq mm0,32
00423       paddq mm1,mm0
00424       movd [edx+28],mm1
00425       psrlq mm1,32
00426 
00427       movd eax,mm1
00428       emms
00429 #else
00430       mov edi,[z]
00431       mov esi,[x]
00432       mov eax,[esi]        //load a
00433       mul [y]           //edx(hi):eax(lo)=a*b
00434       add eax,[carry]    //sum lo carry
00435       adc edx,0          //sum hi carry
00436       mov ecx,edx      //store carry
00437       mov [edi],eax        //load a
00438 
00439       mov eax,[esi+4]        //load a
00440       mul [y]           //edx(hi):eax(lo)=a*b
00441       add eax,ecx      //sum lo carry
00442       adc edx,0          //sum hi carry
00443       mov ecx,edx      //store carry
00444       mov [edi+4],eax        //load a
00445 
00446       mov eax,[esi+8]        //load a
00447       mul [y]           //edx(hi):eax(lo)=a*b
00448       add eax,ecx      //sum lo carry
00449       adc edx,0          //sum hi carry
00450       mov ecx,edx      //store carry
00451       mov [edi+8],eax        //load a
00452 
00453       mov eax,[esi+12]        //load a
00454       mul [y]           //edx(hi):eax(lo)=a*b
00455       add eax,ecx      //sum lo carry
00456       adc edx,0          //sum hi carry
00457       mov ecx,edx      //store carry
00458       mov [edi+12],eax        //load a
00459 
00460       mov eax,[esi+16]        //load a
00461       mul [y]           //edx(hi):eax(lo)=a*b
00462       add eax,ecx      //sum lo carry
00463       adc edx,0          //sum hi carry
00464       mov ecx,edx      //store carry
00465       mov [edi+16],eax        //load a
00466 
00467       mov eax,[esi+20]        //load a
00468       mul [y]           //edx(hi):eax(lo)=a*b
00469       add eax,ecx      //sum lo carry
00470       adc edx,0          //sum hi carry
00471       mov ecx,edx      //store carry
00472       mov [edi+20],eax        //load a
00473 
00474       mov eax,[esi+24]        //load a
00475       mul [y]           //edx(hi):eax(lo)=a*b
00476       add eax,ecx      //sum lo carry
00477       adc edx,0          //sum hi carry
00478       mov ecx,edx      //store carry
00479       mov [edi+24],eax        //load a
00480 
00481       mov eax,[esi+28]        //load a
00482       mul [y]           //edx(hi):eax(lo)=a*b
00483       add eax,ecx      //sum lo carry
00484       adc edx,0          //sum hi carry
00485       mov [edi+28],eax        //load a
00486       mov eax,edx      //store carry
00487 #endif
00488       }
00489    }
00490 
00491 /*
00492 * Eight Word Block Multiply/Add
00493 */
00494 inline word word8_madd3(word z[8], const word x[8], word y, word carry)
00495    {
00496    z[0] = word_madd3(x[0], y, z[0], &carry);
00497    z[1] = word_madd3(x[1], y, z[1], &carry);
00498    z[2] = word_madd3(x[2], y, z[2], &carry);
00499    z[3] = word_madd3(x[3], y, z[3], &carry);
00500    z[4] = word_madd3(x[4], y, z[4], &carry);
00501    z[5] = word_madd3(x[5], y, z[5], &carry);
00502    z[6] = word_madd3(x[6], y, z[6], &carry);
00503    z[7] = word_madd3(x[7], y, z[7], &carry);
00504    return carry;
00505    }
00506 
00507 /*
00508 * Multiply-Add Accumulator
00509 */
00510 inline void word3_muladd(word* w2, word* w1, word* w0, word a, word b)
00511    {
00512    word carry = *w0;
00513    *w0 = word_madd2(a, b, &carry);
00514    *w1 += carry;
00515    *w2 += (*w1 < carry) ? 1 : 0;
00516    }
00517 
00518 /*
00519 * Multiply-Add Accumulator
00520 */
00521 inline void word3_muladd_2(word* w2, word* w1, word* w0, word a, word b)
00522    {
00523    word carry = 0;
00524    a = word_madd2(a, b, &carry);
00525    b = carry;
00526 
00527    word top = (b >> (BOTAN_MP_WORD_BITS-1));
00528    b <<= 1;
00529    b |= (a >> (BOTAN_MP_WORD_BITS-1));
00530    a <<= 1;
00531 
00532    carry = 0;
00533    *w0 = word_add(*w0, a, &carry);
00534    *w1 = word_add(*w1, b, &carry);
00535    *w2 = word_add(*w2, top, &carry);
00536    }
00537 
00538 }
00539 
00540 }
00541 
00542 #endif