From fee6aae442d86146063fa903626a95f7adcf8764 Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Tue, 9 Jul 2019 10:31:08 -0400 Subject: [PATCH] Drop in faster C25519 agreement code. --- .gitignore | 2 + attic/README.md | 4 - attic/lat_lon_to_xyz.js | 25 - attic/world/build.sh | 4 +- attic/world/mkworld.cpp | 21 +- attic/world/old/earth-2015-11-16.bin | Bin 494 -> 0 bytes attic/world/old/earth-2015-11-20.bin | Bin 792 -> 0 bytes attic/world/old/earth-2015-12-17.bin | Bin 732 -> 0 bytes attic/world/old/earth-2016-01-13.bin | Bin 634 -> 0 bytes attic/world/world.bin | Bin 634 -> 732 bytes attic/world/world.c | 4 +- node/C25519.cpp | 718 ++++++++++++++++++++++++++- 12 files changed, 733 insertions(+), 45 deletions(-) delete mode 100644 attic/README.md delete mode 100644 attic/lat_lon_to_xyz.js delete mode 100644 attic/world/old/earth-2015-11-16.bin delete mode 100644 attic/world/old/earth-2015-11-20.bin delete mode 100644 attic/world/old/earth-2015-12-17.bin delete mode 100644 attic/world/old/earth-2016-01-13.bin diff --git a/.gitignore b/.gitignore index 94ebeeb6e..44b5eb56b 100755 --- a/.gitignore +++ b/.gitignore @@ -118,3 +118,5 @@ ext/librethinkdbxx/build .vscode __pycache__ *~ +attic/world/*.c25519 +attic/world/mkworld diff --git a/attic/README.md b/attic/README.md deleted file mode 100644 index 768bccd4a..000000000 --- a/attic/README.md +++ /dev/null @@ -1,4 +0,0 @@ -Retired Code and Miscellaneous Junk -====== - -This directory is for old code that isn't used but we don't want to lose track of, and for anything else random like debug scripts. diff --git a/attic/lat_lon_to_xyz.js b/attic/lat_lon_to_xyz.js deleted file mode 100644 index 692a36873..000000000 --- a/attic/lat_lon_to_xyz.js +++ /dev/null @@ -1,25 +0,0 @@ -'use strict' - -/* This is a utility to convert latitude/longitude into X,Y,Z coordinates as used by clustering. */ - -if (process.argv.length !== 4) { - console.log('Usage: node lat_lon_to_xyz.js roots; const uint64_t id = ZT_WORLD_ID_EARTH; - const uint64_t ts = 1532555817048ULL; // July 25th, 2018 + const uint64_t ts = 1562631342273ULL; // July 8th, 2019 + + roots.push_back(World::Root()); + roots.back().identity = Identity("3a46f1bf30:0:76e66fab33e28549a62ee2064d1843273c2c300ba45c3f20bef02dbad225723bb59a9bb4b13535730961aeecf5a163ace477cceb0727025b99ac14a5166a09a3"); + roots.back().stableEndpoints.push_back(InetAddress("185.180.13.82/9993")); + roots.back().stableEndpoints.push_back(InetAddress("2a02:6ea0:c815::/9993")); // Alice roots.push_back(World::Root()); @@ -118,7 +123,7 @@ int main(int argc,char **argv) // END WORLD DEFINITION // ========================================================================= - fprintf(stderr,"INFO: generating and signing id==%llu ts==%llu"ZT_EOL_S,(unsigned long long)id,(unsigned long long)ts); + fprintf(stderr,"INFO: generating and signing id==%llu ts==%llu" ZT_EOL_S,(unsigned long long)id,(unsigned long long)ts); World nw = World::make(World::TYPE_PLANET,id,ts,currentKP.pub,roots,previousKP); @@ -127,15 +132,15 @@ int main(int argc,char **argv) World testw; testw.deserialize(outtmp,0); if (testw != nw) { - fprintf(stderr,"FATAL: serialization test failed!"ZT_EOL_S); + fprintf(stderr,"FATAL: serialization test failed!" ZT_EOL_S); return 1; } OSUtils::writeFile("world.bin",std::string((const char *)outtmp.data(),outtmp.size())); - fprintf(stderr,"INFO: world.bin written with %u bytes of binary world data."ZT_EOL_S,outtmp.size()); + fprintf(stderr,"INFO: world.bin written with %u bytes of binary world data." ZT_EOL_S,outtmp.size()); fprintf(stdout,ZT_EOL_S); - fprintf(stdout,"#define ZT_DEFAULT_WORLD_LENGTH %u"ZT_EOL_S,outtmp.size()); + fprintf(stdout,"#define ZT_DEFAULT_WORLD_LENGTH %u" ZT_EOL_S,outtmp.size()); fprintf(stdout,"static const unsigned char ZT_DEFAULT_WORLD[ZT_DEFAULT_WORLD_LENGTH] = {"); for(unsigned int i=0;i25mq)IuzEdXVj$;cS%fH9|`@&9L=L$f2 zm(Dh#Jq5~#$5|&_iw#V)yCyGp7hbnbxYM-hBVEj*aj4(%Bcrlrbn4~YjqlS6sjSP| zlf7)sLmj7>YTj_b#vHHk{eOzNb`#;=PxICj1b!KZYPtX_ZYv`glMi6)34d_V^sp$h z&22plW55)-1E2F#kmM18@)r9pZ>ZT^9_eOrRzVpB!`#p6c-Y3@uexY~aI^XV0R+a! zVcaJPh(SP#E&#h7G4jJB;$-ROL2|*TPj`uxRqYUU9Mx`yKwRDtjRN2PRb56b&uWFW zV;mAS42@mH3Azxu0$mS8T=YrVD;xj;1Z%%8(kBUu&jp|307x%x<=^Q7F`Gf1<-Nyc zAe;`(ql`^%PtOfAWrLWsQhm)L`7sjvoA1ys*NgWi8#6z_1<{hGBg&po{9vy49h^ zeD=521kGWe^taey*($}=E0%QrUAM%`a=$J^>a3F|BwoMZ`nu`#&bK*Z< z*2n(2ZeFeQW@W|7b2~4b-a4haQ!%{gTfxG+=@#c*T@UrTyt23 z^G_$P2Fj?htZ3k1U|0y22}uCyS!vmy0g_?=a6y)lAsniQ2`H1j>L?#n4+A5E0!W65 z;Q>fb!||gxfifx#i48zOhzqtm0A)HH9lG=wcFP)mI;{LC<>fQSqJzu*%R8rrz7;5y zxtiDJ5OYtkm+Ah$&{#K}^I5H1lVyZVd3s}yaPAV=$rQ`)67$9LrnU^wkGcxSnt(Pa zFiL(%VQ4%Y!ocwVpM-28&~Cld1wVl@VBa%C?and)xhi|V-X*A>42I?9k-XLItm=vIds z^V#286Euf?(%)i-WvdiduUOLgcij>%%l*0xsk2U=ka*p7>XU7u-XTlZw9PLqD>sH7 zS@~zrpV0MP`W7e7zGRASTUqp7jKOq!%gH}aeogECBDX7HeyhkKcJIFzb0jk_|KyxA z_wmkt!HUA=UaU9HBrIY}o9}+8a7ReP99J=cl^6DYXP&D#!SXW$9mqp`r@TKi(j?8uWeobEKz=o{fF(2 zYB#y^7{Unkpn4b>85BS= zObib|dK!)&y$O_2VMuHM3PN14-2o`m;pot%$FN)0@Y7-CM=39#ITjsU?qA+HHT11O zsm#^9Hiwvdg1t=l|Aof7>737M-I^>TWXjVUdxUeBz)q%EewUapo;S5+fPU0fIMxKT zL4i^7LkdIV;SdIf|NkUp6M=T?r7rjhlmYvm8ESWy0mxO^`}HnC^<*$KF9Eq12vUGD zx(eSmLuDcux_^L86Wu%uEb}8ADpSBP`y@o>$XB4u{L@T81>g|lSqrsCtpI3Gon(8~ zE(WbUZDonc{0Xl)>kBS?*`TrMY_2U&@&S>Z%=147Pk18O_(|-yUjB-kF>)`{3d0;F nSr6Yi|El7~vHR8w=a_o5G9jFbk@MSpq0cZmNi#QIk diff --git a/attic/world/old/earth-2016-01-13.bin b/attic/world/old/earth-2016-01-13.bin deleted file mode 100644 index 5dea4d215716ccf10c3a529c5d45ba69b8a0ec58..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 634 zcmZQ%00NFzC%G6H7=vt#0;G0q?pTtkBry49h^ zeD=521kGWe^taey*($}=E0%QrUAM%`a=$J^>a3F|Bwl-cZ(HGTE$G#{c~$#;rl+me z&0l{~+v@+@YX){lvQj_oZ4UbC@bV0AUZ?%61v~CLUD9uP#;nG_;_(?)R_6CY4ZP>< zwnbRWuULCr^`~>;@|ao2A1>a}aiK70%FJR%ZfTDern!m}EI%_aewd^+dVPVys%>+N_W639nbR|AN=0~~YR<)*PAhq(FJ8K{_*KjM+Sc{Y66Lqpf7tG* zc9ScQfrn+!vbc@voNQXm8xAlqF;Mg z#P9&5r{Va~n?M;AhQtP-AjAdR9e^?&jt*UV47+6wKOI(nl=AYKW6{Co{^gxhL*ELN z%3RHBbBMVo*voYPUudkG&iSm?t;sS%raZl|M>uy0>|~1NcZvDpc~e^k=to_JV@*IC z6c{Bxq%brd4q;&U|4%|T5ootw>Vls@8L;n}p>}5(fLxWmU+)rBPX)*yJxy_bS{HSbLoH%glc)9J3$HE0>+` zF1BitzkrzS@i&(z%)40fXY*F}Ilfbmig(*`CzQPi+`Y}VWUFFx^;B7=*=N_s*iE}{ zt`nB~tCas%Xshs-GO09Q>)!K1Crx(pCNf*OecW%rQ1&c;weh1?&t-a#*nB0N)opYP zxR=D(E4b|Ypu6jmYLWHUS+lonG&L>eOkDTo>%!zUPs-1{W>;s5p1DS3saO{0Vg@Fb zom+T=)H&I-nDQ2!5C!=K0@OJtZi=i2v1=I^)j3&aIUQ95D%0Wr5x^M2I323W7$`IU zbmD5Dj2g>|1`Y;>gwST=*ekqM$?K{=Zj^zKiBDBH;v z#2CEqOo*$S?ZPfZMzS&2otoH<%`iFhMT2j0W@T2A~i47?|KP z*{hE7LG=J_P+($sz{v3rtf%4l(VIXS6^2Bh9xjFp4BH*RQj@}1zDQC oU+)r7nF6EahYW`1B@8KNIT{$xa!l@Fa_3YlVEiX~> 32); + /* Set to all 1s if v was negative; else set to 0s. */ + const int32_t sign = ((int32_t) highword) >> 31; + /* Set to 0x3ffffff if v was negative; else set to 0. */ + const int32_t roundoff = ((uint32_t) sign) >> 6; + /* Should return v / (1<<26) */ + return (v + roundoff) >> 26; +} + +static inline limb div_by_2_25(const limb v) +{ + /* High word of v; no shift needed*/ + const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); + /* Set to all 1s if v was negative; else set to 0s. */ + const int32_t sign = ((int32_t) highword) >> 31; + /* Set to 0x1ffffff if v was negative; else set to 0. */ + const int32_t roundoff = ((uint32_t) sign) >> 7; + /* Should return v / (1<<25) */ + return (v + roundoff) >> 25; +} + +static inline void freduce_coefficients(limb *output) { + unsigned i; + + output[10] = 0; + + for (i = 0; i < 10; i += 2) { + limb over = div_by_2_26(output[i]); + /* The entry condition (that |output[i]| < 280*2^54) means that over is, at + * most, 280*2^28 in the first iteration of this loop. This is added to the + * next limb and we can approximate the resulting bound of that limb by + * 281*2^54. */ + output[i] -= over << 26; + output[i+1] += over; + + /* For the first iteration, |output[i+1]| < 281*2^54, thus |over| < + * 281*2^29. When this is added to the next limb, the resulting bound can + * be approximated as 281*2^54. + * + * For subsequent iterations of the loop, 281*2^54 remains a conservative + * bound and no overflow occurs. */ + over = div_by_2_25(output[i+1]); + output[i+1] -= over << 25; + output[i+2] += over; + } + /* Now |output[10]| < 281*2^29 and all other coefficients are reduced. */ + output[0] += output[10] << 4; + output[0] += output[10] << 1; + output[0] += output[10]; + + output[10] = 0; + + /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19*281*2^29 + * So |over| will be no more than 2^16. */ + { + limb over = div_by_2_26(output[0]); + output[0] -= over << 26; + output[1] += over; + } + + /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 2^16 < 2^26. The + * bound on |output[1]| is sufficient to meet our needs. */ +} + +static inline void fmul(limb *output, const limb *in, const limb *in2) { + limb t[19]; + fproduct(t, in, in2); + /* |t[i]| < 14*2^54 */ + freduce_degree(t); + freduce_coefficients(t); + /* |t[i]| < 2^26 */ + memcpy(output, t, sizeof(limb) * 10); +} + +static inline void fsquare_inner(limb *output, const limb *in) { + output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]); + output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]); + output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) + + ((limb) ((s32) in[0])) * ((s32) in[2])); + output[3] = 2 * (((limb) ((s32) in[1])) * ((s32) in[2]) + + ((limb) ((s32) in[0])) * ((s32) in[3])); + output[4] = ((limb) ((s32) in[2])) * ((s32) in[2]) + + 4 * ((limb) ((s32) in[1])) * ((s32) in[3]) + + 2 * ((limb) ((s32) in[0])) * ((s32) in[4]); + output[5] = 2 * (((limb) ((s32) in[2])) * ((s32) in[3]) + + ((limb) ((s32) in[1])) * ((s32) in[4]) + + ((limb) ((s32) in[0])) * ((s32) in[5])); + output[6] = 2 * (((limb) ((s32) in[3])) * ((s32) in[3]) + + ((limb) ((s32) in[2])) * ((s32) in[4]) + + ((limb) ((s32) in[0])) * ((s32) in[6]) + + 2 * ((limb) ((s32) in[1])) * ((s32) in[5])); + output[7] = 2 * (((limb) ((s32) in[3])) * ((s32) in[4]) + + ((limb) ((s32) in[2])) * ((s32) in[5]) + + ((limb) ((s32) in[1])) * ((s32) in[6]) + + ((limb) ((s32) in[0])) * ((s32) in[7])); + output[8] = ((limb) ((s32) in[4])) * ((s32) in[4]) + + 2 * (((limb) ((s32) in[2])) * ((s32) in[6]) + + ((limb) ((s32) in[0])) * ((s32) in[8]) + + 2 * (((limb) ((s32) in[1])) * ((s32) in[7]) + + ((limb) ((s32) in[3])) * ((s32) in[5]))); + output[9] = 2 * (((limb) ((s32) in[4])) * ((s32) in[5]) + + ((limb) ((s32) in[3])) * ((s32) in[6]) + + ((limb) ((s32) in[2])) * ((s32) in[7]) + + ((limb) ((s32) in[1])) * ((s32) in[8]) + + ((limb) ((s32) in[0])) * ((s32) in[9])); + output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) + + ((limb) ((s32) in[4])) * ((s32) in[6]) + + ((limb) ((s32) in[2])) * ((s32) in[8]) + + 2 * (((limb) ((s32) in[3])) * ((s32) in[7]) + + ((limb) ((s32) in[1])) * ((s32) in[9]))); + output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) + + ((limb) ((s32) in[4])) * ((s32) in[7]) + + ((limb) ((s32) in[3])) * ((s32) in[8]) + + ((limb) ((s32) in[2])) * ((s32) in[9])); + output[12] = ((limb) ((s32) in[6])) * ((s32) in[6]) + + 2 * (((limb) ((s32) in[4])) * ((s32) in[8]) + + 2 * (((limb) ((s32) in[5])) * ((s32) in[7]) + + ((limb) ((s32) in[3])) * ((s32) in[9]))); + output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) + + ((limb) ((s32) in[5])) * ((s32) in[8]) + + ((limb) ((s32) in[4])) * ((s32) in[9])); + output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) + + ((limb) ((s32) in[6])) * ((s32) in[8]) + + 2 * ((limb) ((s32) in[5])) * ((s32) in[9])); + output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) + + ((limb) ((s32) in[6])) * ((s32) in[9])); + output[16] = ((limb) ((s32) in[8])) * ((s32) in[8]) + + 4 * ((limb) ((s32) in[7])) * ((s32) in[9]); + output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]); + output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]); +} + +static void fsquare(limb *output, const limb *in) { + limb t[19]; + fsquare_inner(t, in); + /* |t[i]| < 14*2^54 because the largest product of two limbs will be < + * 2^(27+27) and fsquare_inner adds together, at most, 14 of those + * products. */ + freduce_degree(t); + freduce_coefficients(t); + /* |t[i]| < 2^26 */ + memcpy(output, t, sizeof(limb) * 10); +} + +static inline void fexpand(limb *output, const u8 *input) { +#define F(n,start,shift,mask) \ + output[n] = ((((limb) input[start + 0]) | \ + ((limb) input[start + 1]) << 8 | \ + ((limb) input[start + 2]) << 16 | \ + ((limb) input[start + 3]) << 24) >> shift) & mask; + F(0, 0, 0, 0x3ffffff); + F(1, 3, 2, 0x1ffffff); + F(2, 6, 3, 0x3ffffff); + F(3, 9, 5, 0x1ffffff); + F(4, 12, 6, 0x3ffffff); + F(5, 16, 0, 0x1ffffff); + F(6, 19, 1, 0x3ffffff); + F(7, 22, 3, 0x1ffffff); + F(8, 25, 4, 0x3ffffff); + F(9, 28, 6, 0x1ffffff); +#undef F +} + +#if (-32 >> 1) != -16 +#error "This code only works when >> does sign-extension on negative numbers" +#endif + +static inline s32 s32_eq(s32 a, s32 b) { + a = ~(a ^ b); + a &= a << 16; + a &= a << 8; + a &= a << 4; + a &= a << 2; + a &= a << 1; + return a >> 31; +} + +static inline s32 s32_gte(s32 a, s32 b) { + a -= b; + /* a >= 0 iff a >= b. */ + return ~(a >> 31); +} + +static inline void fcontract(u8 *output, limb *input_limbs) { + int i; + int j; + s32 input[10]; + s32 mask; + + /* |input_limbs[i]| < 2^26, so it's valid to convert to an s32. */ + for (i = 0; i < 10; i++) { + input[i] = input_limbs[i]; + } + + for (j = 0; j < 2; ++j) { + for (i = 0; i < 9; ++i) { + if ((i & 1) == 1) { + /* This calculation is a time-invariant way to make input[i] + * non-negative by borrowing from the next-larger limb. */ + const s32 mask = input[i] >> 31; + const s32 carry = -((input[i] & mask) >> 25); + input[i] = input[i] + (carry << 25); + input[i+1] = input[i+1] - carry; + } else { + const s32 mask = input[i] >> 31; + const s32 carry = -((input[i] & mask) >> 26); + input[i] = input[i] + (carry << 26); + input[i+1] = input[i+1] - carry; + } + } + + /* There's no greater limb for input[9] to borrow from, but we can multiply + * by 19 and borrow from input[0], which is valid mod 2^255-19. */ + { + const s32 mask = input[9] >> 31; + const s32 carry = -((input[9] & mask) >> 25); + input[9] = input[9] + (carry << 25); + input[0] = input[0] - (carry * 19); + } + + /* After the first iteration, input[1..9] are non-negative and fit within + * 25 or 26 bits, depending on position. However, input[0] may be + * negative. */ + } + + /* The first borrow-propagation pass above ended with every limb + except (possibly) input[0] non-negative. + + If input[0] was negative after the first pass, then it was because of a + carry from input[9]. On entry, input[9] < 2^26 so the carry was, at most, + one, since (2**26-1) >> 25 = 1. Thus input[0] >= -19. + + In the second pass, each limb is decreased by at most one. Thus the second + borrow-propagation pass could only have wrapped around to decrease + input[0] again if the first pass left input[0] negative *and* input[1] + through input[9] were all zero. In that case, input[1] is now 2^25 - 1, + and this last borrow-propagation step will leave input[1] non-negative. */ + { + const s32 mask = input[0] >> 31; + const s32 carry = -((input[0] & mask) >> 26); + input[0] = input[0] + (carry << 26); + input[1] = input[1] - carry; + } + + /* All input[i] are now non-negative. However, there might be values between + * 2^25 and 2^26 in a limb which is, nominally, 25 bits wide. */ + for (j = 0; j < 2; j++) { + for (i = 0; i < 9; i++) { + if ((i & 1) == 1) { + const s32 carry = input[i] >> 25; + input[i] &= 0x1ffffff; + input[i+1] += carry; + } else { + const s32 carry = input[i] >> 26; + input[i] &= 0x3ffffff; + input[i+1] += carry; + } + } + + { + const s32 carry = input[9] >> 25; + input[9] &= 0x1ffffff; + input[0] += 19*carry; + } + } + + /* If the first carry-chain pass, just above, ended up with a carry from + * input[9], and that caused input[0] to be out-of-bounds, then input[0] was + * < 2^26 + 2*19, because the carry was, at most, two. + * + * If the second pass carried from input[9] again then input[0] is < 2*19 and + * the input[9] -> input[0] carry didn't push input[0] out of bounds. */ + + /* It still remains the case that input might be between 2^255-19 and 2^255. + * In this case, input[1..9] must take their maximum value and input[0] must + * be >= (2^255-19) & 0x3ffffff, which is 0x3ffffed. */ + mask = s32_gte(input[0], 0x3ffffed); + for (i = 1; i < 10; i++) { + if ((i & 1) == 1) { + mask &= s32_eq(input[i], 0x1ffffff); + } else { + mask &= s32_eq(input[i], 0x3ffffff); + } + } + + /* mask is either 0xffffffff (if input >= 2^255-19) and zero otherwise. Thus + * this conditionally subtracts 2^255-19. */ + input[0] -= mask & 0x3ffffed; + + for (i = 1; i < 10; i++) { + if ((i & 1) == 1) { + input[i] -= mask & 0x1ffffff; + } else { + input[i] -= mask & 0x3ffffff; + } + } + + input[1] <<= 2; + input[2] <<= 3; + input[3] <<= 5; + input[4] <<= 6; + input[6] <<= 1; + input[7] <<= 3; + input[8] <<= 4; + input[9] <<= 6; +#define F(i, s) \ + output[s+0] |= input[i] & 0xff; \ + output[s+1] = (input[i] >> 8) & 0xff; \ + output[s+2] = (input[i] >> 16) & 0xff; \ + output[s+3] = (input[i] >> 24) & 0xff; + output[0] = 0; + output[16] = 0; + F(0,0); + F(1,3); + F(2,6); + F(3,9); + F(4,12); + F(5,16); + F(6,19); + F(7,22); + F(8,25); + F(9,28); +#undef F +} + +static inline void fmonty(limb *x2, limb *z2, /* output 2Q */ + limb *x3, limb *z3, /* output Q + Q' */ + limb *x, limb *z, /* input Q */ + limb *xprime, limb *zprime, /* input Q' */ + const limb *qmqp /* input Q - Q' */) { + limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19], + zzprime[19], zzzprime[19], xxxprime[19]; + + memcpy(origx, x, 10 * sizeof(limb)); + fsum(x, z); + /* |x[i]| < 2^27 */ + fdifference(z, origx); /* does x - z */ + /* |z[i]| < 2^27 */ + + memcpy(origxprime, xprime, sizeof(limb) * 10); + fsum(xprime, zprime); + /* |xprime[i]| < 2^27 */ + fdifference(zprime, origxprime); + /* |zprime[i]| < 2^27 */ + fproduct(xxprime, xprime, z); + /* |xxprime[i]| < 14*2^54: the largest product of two limbs will be < + * 2^(27+27) and fproduct adds together, at most, 14 of those products. + * (Approximating that to 2^58 doesn't work out.) */ + fproduct(zzprime, x, zprime); + /* |zzprime[i]| < 14*2^54 */ + freduce_degree(xxprime); + freduce_coefficients(xxprime); + /* |xxprime[i]| < 2^26 */ + freduce_degree(zzprime); + freduce_coefficients(zzprime); + /* |zzprime[i]| < 2^26 */ + memcpy(origxprime, xxprime, sizeof(limb) * 10); + fsum(xxprime, zzprime); + /* |xxprime[i]| < 2^27 */ + fdifference(zzprime, origxprime); + /* |zzprime[i]| < 2^27 */ + fsquare(xxxprime, xxprime); + /* |xxxprime[i]| < 2^26 */ + fsquare(zzzprime, zzprime); + /* |zzzprime[i]| < 2^26 */ + fproduct(zzprime, zzzprime, qmqp); + /* |zzprime[i]| < 14*2^52 */ + freduce_degree(zzprime); + freduce_coefficients(zzprime); + /* |zzprime[i]| < 2^26 */ + memcpy(x3, xxxprime, sizeof(limb) * 10); + memcpy(z3, zzprime, sizeof(limb) * 10); + + fsquare(xx, x); + /* |xx[i]| < 2^26 */ + fsquare(zz, z); + /* |zz[i]| < 2^26 */ + fproduct(x2, xx, zz); + /* |x2[i]| < 14*2^52 */ + freduce_degree(x2); + freduce_coefficients(x2); + /* |x2[i]| < 2^26 */ + fdifference(zz, xx); // does zz = xx - zz + /* |zz[i]| < 2^27 */ + memset(zzz + 10, 0, sizeof(limb) * 9); + fscalar_product(zzz, zz, 121665); + /* |zzz[i]| < 2^(27+17) */ + /* No need to call freduce_degree here: + fscalar_product doesn't increase the degree of its input. */ + freduce_coefficients(zzz); + /* |zzz[i]| < 2^26 */ + fsum(zzz, xx); + /* |zzz[i]| < 2^27 */ + fproduct(z2, zz, zzz); + /* |z2[i]| < 14*2^(26+27) */ + freduce_degree(z2); + freduce_coefficients(z2); + /* |z2|i| < 2^26 */ +} + +static inline void swap_conditional(limb a[19], limb b[19], limb iswap) { + unsigned i; + const s32 swap = (s32) -iswap; + + for (i = 0; i < 10; ++i) { + const s32 x = swap & ( ((s32)a[i]) ^ ((s32)b[i]) ); + a[i] = ((s32)a[i]) ^ x; + b[i] = ((s32)b[i]) ^ x; + } +} + +static inline void cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) { + limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0}; + limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t; + limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1}; + limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h; + + unsigned i, j; + + memcpy(nqpqx, q, sizeof(limb) * 10); + + for (i = 0; i < 32; ++i) { + u8 byte = n[31 - i]; + for (j = 0; j < 8; ++j) { + const limb bit = byte >> 7; + + swap_conditional(nqx, nqpqx, bit); + swap_conditional(nqz, nqpqz, bit); + fmonty(nqx2, nqz2, + nqpqx2, nqpqz2, + nqx, nqz, + nqpqx, nqpqz, + q); + swap_conditional(nqx2, nqpqx2, bit); + swap_conditional(nqz2, nqpqz2, bit); + + t = nqx; + nqx = nqx2; + nqx2 = t; + t = nqz; + nqz = nqz2; + nqz2 = t; + t = nqpqx; + nqpqx = nqpqx2; + nqpqx2 = t; + t = nqpqz; + nqpqz = nqpqz2; + nqpqz2 = t; + + byte <<= 1; + } + } + + memcpy(resultx, nqx, sizeof(limb) * 10); + memcpy(resultz, nqz, sizeof(limb) * 10); +} + +static inline void crecip(limb *out, const limb *z) { + limb z2[10]; + limb z9[10]; + limb z11[10]; + limb z2_5_0[10]; + limb z2_10_0[10]; + limb z2_20_0[10]; + limb z2_50_0[10]; + limb z2_100_0[10]; + limb t0[10]; + limb t1[10]; + int i; + + /* 2 */ fsquare(z2,z); + /* 4 */ fsquare(t1,z2); + /* 8 */ fsquare(t0,t1); + /* 9 */ fmul(z9,t0,z); + /* 11 */ fmul(z11,z9,z2); + /* 22 */ fsquare(t0,z11); + /* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9); + + /* 2^6 - 2^1 */ fsquare(t0,z2_5_0); + /* 2^7 - 2^2 */ fsquare(t1,t0); + /* 2^8 - 2^3 */ fsquare(t0,t1); + /* 2^9 - 2^4 */ fsquare(t1,t0); + /* 2^10 - 2^5 */ fsquare(t0,t1); + /* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0); + + /* 2^11 - 2^1 */ fsquare(t0,z2_10_0); + /* 2^12 - 2^2 */ fsquare(t1,t0); + /* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } + /* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0); + + /* 2^21 - 2^1 */ fsquare(t0,z2_20_0); + /* 2^22 - 2^2 */ fsquare(t1,t0); + /* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } + /* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0); + + /* 2^41 - 2^1 */ fsquare(t1,t0); + /* 2^42 - 2^2 */ fsquare(t0,t1); + /* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); } + /* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0); + + /* 2^51 - 2^1 */ fsquare(t0,z2_50_0); + /* 2^52 - 2^2 */ fsquare(t1,t0); + /* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } + /* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0); + + /* 2^101 - 2^1 */ fsquare(t1,z2_100_0); + /* 2^102 - 2^2 */ fsquare(t0,t1); + /* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); } + /* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0); + + /* 2^201 - 2^1 */ fsquare(t0,t1); + /* 2^202 - 2^2 */ fsquare(t1,t0); + /* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } + /* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0); + + /* 2^251 - 2^1 */ fsquare(t1,t0); + /* 2^252 - 2^2 */ fsquare(t0,t1); + /* 2^253 - 2^3 */ fsquare(t1,t0); + /* 2^254 - 2^4 */ fsquare(t0,t1); + /* 2^255 - 2^5 */ fsquare(t1,t0); + /* 2^255 - 21 */ fmul(out,t1,z11); +} + +static void crypto_scalarmult(u8 *mypublic, const u8 *secret, const u8 *basepoint) { + limb bp[10], x[10], z[11], zmone[10]; + uint8_t e[32]; + int i; + + for (i = 0; i < 32; ++i) e[i] = secret[i]; + e[0] &= 248; + e[31] &= 127; + e[31] |= 64; + + fexpand(bp, basepoint); + cmult(x, z, e, bp); + crecip(zmone, z); + fmul(z, x, zmone); + fcontract(mypublic, z); +} + +#if 0 void add(unsigned int out[32],const unsigned int a[32],const unsigned int b[32]) { unsigned int j; @@ -287,11 +994,12 @@ int crypto_scalarmult(unsigned char *q,const unsigned char *n,const unsigned cha for (i = 0;i < 32;++i) q[i] = work[64 + i]; return 0; } +#endif static const unsigned char base[32] = {9}; -int crypto_scalarmult_base(unsigned char *q,const unsigned char *n) +static inline void crypto_scalarmult_base(unsigned char *q,const unsigned char *n) { - return crypto_scalarmult(q,n,base); + crypto_scalarmult(q,n,base); } //////////////////////////////////////////////////////////////////////////////