1 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // Adapted from libdivide (http://libdivide.com/) 4 // 5 // libdivide uses the zlib license. This permits usage of libdivide in any product - commercial, GPL, or otherwise. 6 // No attribution is required. However, you must not claim to have written libdivide, or distribute an altered 7 // version without plainly marking it as modified. 8 // 9 // I (tomer) did some dlang-styling to the code but I have no idea what's going on there, and I'd like to 10 // keep it that way. See unittest for API. 11 // 12 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 13 module std.math.divide; // presumably that's where it should go 14 15 import std.stdint; 16 17 private: 18 19 enum { 20 LIBDIVIDE_32_SHIFT_MASK = 0x1F, 21 LIBDIVIDE_64_SHIFT_MASK = 0x3F, 22 LIBDIVIDE_ADD_MARKER = 0x40, 23 LIBDIVIDE_U32_SHIFT_PATH = 0x80, 24 LIBDIVIDE_U64_SHIFT_PATH = 0x80, 25 LIBDIVIDE_S32_SHIFT_PATH = 0x20, 26 LIBDIVIDE_NEGATIVE_DIVISOR = 0x80, 27 } 28 29 static int32_t libdivide__count_trailing_zeros32(uint32_t val) { 30 /* Fast way to count trailing zeros */ 31 //return __builtin_ctz(val); 32 /* Dorky way to count trailing zeros. Note that this hangs for val = 0! */ 33 int32_t result = 0; 34 val = (val ^ (val - 1)) >> 1; // Set v's trailing 0s to 1s and zero rest 35 while (val) { 36 val >>= 1; 37 result++; 38 } 39 return result; 40 } 41 42 static int32_t libdivide__count_leading_zeros32(uint32_t val) { 43 /* Fast way to count leading zeros */ 44 //return __builtin_clz(val); 45 /* Dorky way to count leading zeros. Note that this hangs for val = 0! */ 46 int32_t result = 0; 47 while (! (val & (1U << 31))) { 48 val <<= 1; 49 result++; 50 } 51 return result; 52 } 53 54 static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { 55 //libdivide_64_div_32_to_32: divides a 64 bit uint {u1, u0} by a 32 bit uint {v}. The result must fit in 32 bits. 56 //Returns the quotient directly and the remainder in *r 57 //#if (LIBDIVIDE_IS_X86_64) 58 // uint32_t result; 59 // __asm__("divl %[v]" 60 // : "=a"(result), "=d"(*r) 61 // : [v] "r"(v), "a"(u0), "d"(u1) 62 // ); 63 // return result; 64 //} 65 //#else 66 uint64_t n = ((cast(uint64_t)u1) << 32) | u0; 67 uint32_t result = cast(uint32_t)(n / v); 68 *r = cast(uint32_t)(n - result * cast(uint64_t)v); 69 return result; 70 //#endif 71 } 72 73 static uint32_t libdivide__mullhi_u32(uint32_t x, uint32_t y) { 74 uint64_t xl = x, yl = y; 75 uint64_t rl = xl * yl; 76 return cast(uint32_t)(rl >> 32); 77 } 78 79 static int32_t libdivide__count_trailing_zeros64(uint64_t val) { 80 // Fast way to count trailing zeros. Note that we disable this in 32 bit because gcc does something horrible - 81 // it calls through to a dynamically bound function. 82 //return __builtin_ctzll(val); 83 // Pretty good way to count trailing zeros. Note that this hangs for val = 0 84 assert (val != 0); 85 uint32_t lo = val & 0xFFFFFFFF; 86 if (lo != 0) return libdivide__count_trailing_zeros32(lo); 87 return 32 + libdivide__count_trailing_zeros32(val >> 32); 88 } 89 90 static int64_t libdivide__mullhi_s64(int64_t x, int64_t y) { 91 static if (is(cent)) { 92 cent xl = x, yl = y; 93 cent rl = xl * yl; 94 return cast(cent)(rl >> 64); 95 } 96 else { 97 //full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) 98 const uint32_t mask = 0xFFFFFFFF; 99 const uint32_t x0 = cast(uint32_t)(x & mask), y0 = cast(uint32_t)(y & mask); 100 const int32_t x1 = cast(int32_t)(x >> 32), y1 = cast(int32_t)(y >> 32); 101 const uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0); 102 const int64_t t = x1*cast(int64_t)y0 + x0y0_hi; 103 const int64_t w1 = x0*cast(int64_t)y1 + (t & mask); 104 return x1*cast(int64_t)y1 + (t >> 32) + (w1 >> 32); 105 } 106 } 107 108 static int32_t libdivide__mullhi_s32(int32_t x, int32_t y) { 109 int64_t xl = x, yl = y; 110 int64_t rl = xl * yl; 111 return cast(int32_t)(rl >> 32); //needs to be arithmetic shift 112 } 113 114 static int32_t libdivide__count_leading_zeros64(uint64_t val) { 115 /* Fast way to count leading zeros */ 116 //return __builtin_clzll(val); 117 /* Dorky way to count leading zeros. Note that this hangs for val = 0! */ 118 int32_t result = 0; 119 while (! (val & (1UL << 63))) { 120 val <<= 1; 121 result++; 122 } 123 return result; 124 } 125 126 static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) { 127 const uint64_t b = (1UL << 32); // Number base (16 bits). 128 uint64_t un1, un0, // Norm. dividend LSD's. 129 vn1, vn0, // Norm. divisor digits. 130 q1, q0, // Quotient digits. 131 un64, un21, un10,// Dividend digit pairs. 132 rhat; // A remainder. 133 int s; // Shift amount for norm. 134 135 if (u1 >= v) { // If overflow, set rem. 136 if (r !is null) { // to an impossible value, 137 *r = cast(uint64_t)(-1); // and return the largest 138 } 139 else { 140 return cast(uint64_t)(-1); // possible quotient. 141 } 142 } 143 144 /* count leading zeros */ 145 s = libdivide__count_leading_zeros64(v); // 0 <= s <= 63. 146 if (s > 0) { 147 v = v << s; // Normalize divisor. 148 un64 = (u1 << s) | ((u0 >> (64 - s)) & (-s >> 31)); 149 un10 = u0 << s; // Shift dividend left. 150 } 151 else { 152 // Avoid undefined behavior. 153 un64 = u1 | u0; 154 un10 = u0; 155 } 156 157 vn1 = v >> 32; // Break divisor up into 158 vn0 = v & 0xFFFFFFFF; // two 32-bit digits. 159 160 un1 = un10 >> 32; // Break right half of 161 un0 = un10 & 0xFFFFFFFF; // dividend into two digits. 162 163 q1 = un64/vn1; // Compute the first 164 rhat = un64 - q1*vn1; // quotient digit, q1. 165 again1: 166 if (q1 >= b || q1*vn0 > b*rhat + un1) { 167 q1 = q1 - 1; 168 rhat = rhat + vn1; 169 if (rhat < b) goto again1; 170 } 171 172 un21 = un64*b + un1 - q1*v; // Multiply and subtract. 173 174 q0 = un21/vn1; // Compute the second 175 rhat = un21 - q0*vn1; // quotient digit, q0. 176 again2: 177 if (q0 >= b || q0*vn0 > b*rhat + un0) { 178 q0 = q0 - 1; 179 rhat = rhat + vn1; 180 if (rhat < b) goto again2; 181 } 182 183 if (r !is null) { // If remainder is wanted, 184 *r = (un21*b + un0 - q0*v) >> s; // return it. 185 } 186 return q1*b + q0; 187 } 188 189 static uint64_t libdivide__mullhi_u64(uint64_t x, uint64_t y) { 190 static if (is(ucent)) { 191 ucent xl = x, yl = y; 192 ucent rl = xl * yl; 193 return cast(ucent)(rl >> 64); 194 } 195 else { 196 //full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) 197 const uint32_t mask = 0xFFFFFFFF; 198 const uint32_t x0 = cast(uint32_t)(x & mask), x1 = cast(uint32_t)(x >> 32); 199 const uint32_t y0 = cast(uint32_t)(y & mask), y1 = cast(uint32_t)(y >> 32); 200 const uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0); 201 const uint64_t x0y1 = x0 * cast(uint64_t)y1; 202 const uint64_t x1y0 = x1 * cast(uint64_t)y0; 203 const uint64_t x1y1 = x1 * cast(uint64_t)y1; 204 205 uint64_t temp = x1y0 + x0y0_hi; 206 uint64_t temp_lo = temp & mask, temp_hi = temp >> 32; 207 return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32); 208 } 209 } 210 211 public: 212 213 struct S32Denominator { 214 alias Type = typeof(magic); 215 int32_t magic; 216 uint8_t more; 217 218 this(int32_t d) { 219 assert (d > 0, "d<=0"); 220 221 // If d is a power of 2, or negative a power of 2, we have to use a shift. This is especially 222 // important because the magic algorithm fails for -1. To check if d is a power of 2 or its inverse, 223 // it suffices to check whether its absolute value has exactly one bit set. This works even for INT_MIN, 224 // because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set and is a power of 2. 225 uint32_t absD = cast(uint32_t)(d < 0 ? -d : d); //gcc optimizes this to the fast abs trick 226 if ((absD & (absD - 1)) == 0) { //check if exactly one bit is set, don't care if absD is 0 since that's divide by zero 227 this.magic = 0; 228 this.more = cast(uint8_t)(libdivide__count_trailing_zeros32(absD) | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0) | LIBDIVIDE_S32_SHIFT_PATH); 229 } 230 else { 231 const uint32_t floor_log_2_d = cast(uint8_t)(31 - libdivide__count_leading_zeros32(absD)); 232 assert(floor_log_2_d >= 1); 233 234 uint8_t more; 235 //the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word is 0 and the high word is floor_log_2_d - 1 236 uint32_t rem, proposed_m; 237 proposed_m = libdivide_64_div_32_to_32(1U << (floor_log_2_d - 1), 0, absD, &rem); 238 const uint32_t e = absD - rem; 239 240 /* We are going to start with a power of floor_log_2_d - 1. This works if works if e < 2**floor_log_2_d. */ 241 if (e < (1U << floor_log_2_d)) { 242 /* This power works */ 243 more = cast(uint8_t)(floor_log_2_d - 1); 244 } 245 else { 246 // We need to go one higher. This should not make proposed_m overflow, but it will make it negative 247 // when interpreted as an int32_t. 248 proposed_m += proposed_m; 249 const uint32_t twice_rem = rem + rem; 250 if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; 251 more = cast(uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); //use the general algorithm 252 } 253 proposed_m += 1; 254 this.magic = (d < 0 ? -cast(int32_t)proposed_m : cast(int32_t)proposed_m); 255 this.more = more; 256 } 257 } 258 259 int32_t opBinaryRight(string op: "/")(int32_t numer) { 260 if (more & LIBDIVIDE_S32_SHIFT_PATH) { 261 uint8_t shifter = more & LIBDIVIDE_32_SHIFT_MASK; 262 int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1)); 263 q = q >> shifter; 264 int32_t shiftMask = cast(int8_t)(more >> 7); //must be arithmetic shift and then sign-extend 265 q = (q ^ shiftMask) - shiftMask; 266 return q; 267 } 268 else { 269 int32_t q = libdivide__mullhi_s32(magic, numer); 270 if (more & LIBDIVIDE_ADD_MARKER) { 271 int32_t sign = cast(int8_t)(more >> 7); //must be arithmetic shift and then sign extend 272 q += ((numer ^ sign) - sign); 273 } 274 q >>= more & LIBDIVIDE_32_SHIFT_MASK; 275 q += (q < 0); 276 return q; 277 } 278 } 279 } 280 281 struct U32Denominator { 282 alias Type = typeof(magic); 283 uint32_t magic; 284 uint8_t more; 285 286 this(uint32_t d) { 287 assert (d > 0, "d==0"); 288 if ((d & (d - 1)) == 0) { 289 this.magic = 0; 290 this.more = cast(uint8_t)(libdivide__count_trailing_zeros32(d) | LIBDIVIDE_U32_SHIFT_PATH); 291 } 292 else { 293 const uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(d); 294 295 uint8_t more; 296 uint32_t rem, proposed_m; 297 proposed_m = libdivide_64_div_32_to_32(1U << floor_log_2_d, 0, d, &rem); 298 299 assert(rem > 0 && rem < d); 300 const uint32_t e = d - rem; 301 302 /* This power works if e < 2**floor_log_2_d. */ 303 if (e < (1U << floor_log_2_d)) { 304 /* This power works */ 305 more = cast(uint8_t)floor_log_2_d; 306 } 307 else { 308 // We have to use the general 33-bit algorithm. We need to compute (2**power) / d. 309 // However, we already have (2**(power-1))/d and its remainder. By doubling both, and then 310 // correcting the remainder, we can compute the larger division. */ 311 proposed_m += proposed_m; //don't care about overflow here - in fact, we expect it 312 const uint32_t twice_rem = rem + rem; 313 if (twice_rem >= d || twice_rem < rem) proposed_m += 1; 314 more = cast(uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); 315 } 316 this.magic = 1 + proposed_m; 317 this.more = more; 318 //result.more's shift should in general be ceil_log_2_d. But if we used the smaller power, we 319 //subtract one from the shift because we're using the smaller power. If we're using the larger power, 320 //we subtract one from the shift because it's taken care of by the add indicator. 321 //So floor_log_2_d happens to be correct in both cases. 322 } 323 } 324 325 uint32_t opBinaryRight(string op: "/")(uint32_t numer) { 326 if (more & LIBDIVIDE_U32_SHIFT_PATH) { 327 return numer >> (more & LIBDIVIDE_32_SHIFT_MASK); 328 } 329 else { 330 uint32_t q = libdivide__mullhi_u32(magic, numer); 331 if (more & LIBDIVIDE_ADD_MARKER) { 332 uint32_t t = ((numer - q) >> 1) + q; 333 return t >> (more & LIBDIVIDE_32_SHIFT_MASK); 334 } 335 else { 336 return q >> more; //all upper bits are 0 - don't need to mask them off 337 } 338 } 339 } 340 } 341 342 struct S64Denominator { 343 alias Type = typeof(magic); 344 int64_t magic; 345 uint8_t more; 346 347 this(int64_t d) { 348 assert (d > 0, "d<=0"); 349 // If d is a power of 2, or negative a power of 2, we have to use a shift. This is especially important 350 // because the magic algorithm fails for -1. To check if d is a power of 2 or its inverse, it suffices 351 // to check whether its absolute value has exactly one bit set. This works even for INT_MIN, 352 // because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set and is a power of 2. 353 const uint64_t absD = cast(uint64_t)(d < 0 ? -d : d); //gcc optimizes this to the fast abs trick 354 if ((absD & (absD - 1)) == 0) { //check if exactly one bit is set, don't care if absD is 0 since that's divide by zero 355 this.more = cast(ubyte)(libdivide__count_trailing_zeros64(absD) | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); 356 this.magic = 0; 357 } 358 else { 359 const uint32_t floor_log_2_d = cast(uint32_t)(63 - libdivide__count_leading_zeros64(absD)); 360 361 //the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word is 0 and the high word is floor_log_2_d - 1 362 uint8_t more; 363 uint64_t rem, proposed_m; 364 proposed_m = libdivide_128_div_64_to_64(1UL << (floor_log_2_d - 1), 0, absD, &rem); 365 const uint64_t e = absD - rem; 366 367 /* We are going to start with a power of floor_log_2_d - 1. This works if works if e < 2**floor_log_2_d. */ 368 if (e < (1UL << floor_log_2_d)) { 369 /* This power works */ 370 more = cast(ubyte)(floor_log_2_d - 1); 371 } 372 else { 373 // We need to go one higher. This should not make proposed_m overflow, but it will make it 374 // negative when interpreted as an int32_t. 375 proposed_m += proposed_m; 376 const uint64_t twice_rem = rem + rem; 377 if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; 378 more = cast(ubyte)(floor_log_2_d | LIBDIVIDE_ADD_MARKER | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); 379 } 380 proposed_m += 1; 381 this.more = more; 382 this.magic = (d < 0 ? -cast(int64_t)proposed_m : cast(int64_t)proposed_m); 383 } 384 } 385 386 int64_t opBinaryRight(string op: "/")(int64_t numer) { 387 if (magic == 0) { //shift path 388 uint32_t shifter = more & LIBDIVIDE_64_SHIFT_MASK; 389 int64_t q = numer + ((numer >> 63) & ((1L << shifter) - 1)); 390 q = q >> shifter; 391 int64_t shiftMask = cast(int8_t)(more >> 7); //must be arithmetic shift and then sign-extend 392 q = (q ^ shiftMask) - shiftMask; 393 return q; 394 } 395 else { 396 int64_t q = libdivide__mullhi_s64(magic, numer); 397 if (more & LIBDIVIDE_ADD_MARKER) { 398 int64_t sign = cast(int8_t)(more >> 7); //must be arithmetic shift and then sign extend 399 q += ((numer ^ sign) - sign); 400 } 401 q >>= more & LIBDIVIDE_64_SHIFT_MASK; 402 q += (q < 0); 403 return q; 404 } 405 } 406 } 407 408 struct U64Denominator { 409 alias Type = typeof(magic); 410 uint64_t magic; 411 uint8_t more; 412 413 this(uint64_t d) { 414 assert (d > 0, "d==0"); 415 if ((d & (d - 1)) == 0) { 416 this.more = cast(uint8_t)(libdivide__count_trailing_zeros64(d) | LIBDIVIDE_U64_SHIFT_PATH); 417 this.magic = 0; 418 } 419 else { 420 const uint32_t floor_log_2_d = 63 - libdivide__count_leading_zeros64(d); 421 422 uint64_t proposed_m, rem; 423 uint8_t more; 424 proposed_m = libdivide_128_div_64_to_64(1UL << floor_log_2_d, 0, d, &rem); //== (1 << (64 + floor_log_2_d)) / d 425 426 assert(rem > 0 && rem < d); 427 const uint64_t e = d - rem; 428 429 /* This power works if e < 2**floor_log_2_d. */ 430 if (e < (1UL << floor_log_2_d)) { 431 /* This power works */ 432 more = cast(uint8_t)floor_log_2_d; 433 } 434 else { 435 // We have to use the general 65-bit algorithm. We need to compute (2**power) / d. However, 436 // we already have (2**(power-1))/d and its remainder. By doubling both, and then correcting 437 // the remainder, we can compute the larger division. 438 proposed_m += proposed_m; //don't care about overflow here - in fact, we expect it 439 const uint64_t twice_rem = rem + rem; 440 if (twice_rem >= d || twice_rem < rem) proposed_m += 1; 441 more = cast(uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); 442 } 443 this.magic = 1 + proposed_m; 444 this.more = more; 445 //result.more's shift should in general be ceil_log_2_d. But if we used the smaller power, we subtract 446 //one from the shift because we're using the smaller power. If we're using the larger power, we subtract 447 //one from the shift because it's taken care of by the add indicator. So floor_log_2_d happens to be 448 //correct in both cases, which is why we do it outside of the if statement. 449 } 450 } 451 452 uint64_t opBinaryRight(string op: "/")(uint64_t numer) { 453 if (more & LIBDIVIDE_U64_SHIFT_PATH) { 454 return numer >> (more & LIBDIVIDE_64_SHIFT_MASK); 455 } 456 else { 457 uint64_t q = libdivide__mullhi_u64(magic, numer); 458 if (more & LIBDIVIDE_ADD_MARKER) { 459 uint64_t t = ((numer - q) >> 1) + q; 460 return t >> (more & LIBDIVIDE_64_SHIFT_MASK); 461 } 462 else { 463 return q >> more; //all upper bits are 0 - don't need to mask them off 464 } 465 } 466 } 467 } 468 469 auto denominator(T)(T value) { 470 static if (is(T == uint32_t)) { 471 return U32Denominator(value); 472 } 473 else static if (is(T == int32_t)) { 474 return S32Denominator(value); 475 } 476 else static if (is(T == uint64_t)) { 477 return U64Denominator(value); 478 } 479 else static if (is(T == int64_t)) { 480 return S64Denominator(value); 481 } 482 else { 483 static assert (false, "T must be an int, uint, long, ulong, not " ~ T.stringof); 484 } 485 } 486 487 488 unittest { 489 static assert (1000 / U32Denominator(31) == 32); 490 assert (1000 / U32Denominator(31) == 32); 491 assert (1000 / S32Denominator(50) == 20); 492 assert (1000 / denominator(11) == 90); 493 } 494 495 unittest { 496 import std.random; 497 import std..string; 498 import std.typetuple; 499 500 int counter; 501 alias denominators = TypeTuple!(S32Denominator, U32Denominator, S64Denominator, U64Denominator); 502 enum tests = 5000; 503 504 foreach(D; denominators) { 505 foreach(j; 0 .. tests) { 506 D.Type d = uniform(1, D.Type.max); 507 D.Type n = uniform(D.Type.min, D.Type.max); 508 D.Type q = n / d; 509 510 D d2 = D(d); 511 D.Type q2 = n / d2; 512 assert (q == q2, "%s/%s = %s, not %s".format(n, d, q, q2)); 513 counter++; 514 } 515 } 516 assert(counter == denominators.length * tests, "counter=%s".format(counter)); 517 } 518 519 //void main(){}