1 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2 //
3 // Adapted from libdivide (http://libdivide.com/)
4 //
5 // libdivide uses the zlib license. This permits usage of libdivide in any product - commercial, GPL, or otherwise.
6 // No attribution is required. However, you must not claim to have written libdivide, or distribute an altered
7 // version without plainly marking it as modified.
8 //
9 // I (tomer) did some dlang-styling to the code but I have no idea what's going on there, and I'd like to
10 // keep it that way. See unittest for API.
11 //
12 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
13 module std.math.divide;  // presumably that's where it should go
14 
15 import std.stdint;
16 
17 private:
18 
19 enum {
20     LIBDIVIDE_32_SHIFT_MASK = 0x1F,
21     LIBDIVIDE_64_SHIFT_MASK = 0x3F,
22     LIBDIVIDE_ADD_MARKER = 0x40,
23     LIBDIVIDE_U32_SHIFT_PATH = 0x80,
24     LIBDIVIDE_U64_SHIFT_PATH = 0x80,
25     LIBDIVIDE_S32_SHIFT_PATH = 0x20,
26     LIBDIVIDE_NEGATIVE_DIVISOR = 0x80,
27 }
28 
29 static int32_t libdivide__count_trailing_zeros32(uint32_t val) {
30     /* Fast way to count trailing zeros */
31     //return __builtin_ctz(val);
32     /* Dorky way to count trailing zeros.   Note that this hangs for val = 0! */
33     int32_t result = 0;
34     val = (val ^ (val - 1)) >> 1;  // Set v's trailing 0s to 1s and zero rest
35     while (val) {
36         val >>= 1;
37         result++;
38     }
39     return result;
40 }
41 
42 static int32_t libdivide__count_leading_zeros32(uint32_t val) {
43     /* Fast way to count leading zeros */
44     //return __builtin_clz(val);
45     /* Dorky way to count leading zeros.  Note that this hangs for val = 0! */
46     int32_t result = 0;
47     while (! (val & (1U << 31))) {
48         val <<= 1;
49         result++;
50     }
51     return result;
52 }
53 
54 static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
55 //libdivide_64_div_32_to_32: divides a 64 bit uint {u1, u0} by a 32 bit uint {v}.  The result must fit in 32 bits.
56 //Returns the quotient directly and the remainder in *r
57 //#if (LIBDIVIDE_IS_X86_64)
58 //    uint32_t result;
59 //    __asm__("divl %[v]"
60 //            : "=a"(result), "=d"(*r)
61 //            : [v] "r"(v), "a"(u0), "d"(u1)
62 //            );
63 //    return result;
64 //}
65 //#else
66     uint64_t n = ((cast(uint64_t)u1) << 32) | u0;
67     uint32_t result = cast(uint32_t)(n / v);
68     *r = cast(uint32_t)(n - result * cast(uint64_t)v);
69     return result;
70 //#endif
71 }
72 
73 static uint32_t libdivide__mullhi_u32(uint32_t x, uint32_t y) {
74     uint64_t xl = x, yl = y;
75     uint64_t rl = xl * yl;
76     return cast(uint32_t)(rl >> 32);
77 }
78 
79 static int32_t libdivide__count_trailing_zeros64(uint64_t val) {
80     // Fast way to count trailing zeros.  Note that we disable this in 32 bit because gcc does something horrible -
81     // it calls through to a dynamically bound function.
82     //return __builtin_ctzll(val);
83     // Pretty good way to count trailing zeros.  Note that this hangs for val = 0
84     assert (val != 0);
85     uint32_t lo = val & 0xFFFFFFFF;
86     if (lo != 0) return libdivide__count_trailing_zeros32(lo);
87     return 32 + libdivide__count_trailing_zeros32(val >> 32);
88 }
89 
90 static int64_t libdivide__mullhi_s64(int64_t x, int64_t y) {
91     static if (is(cent)) {
92         cent xl = x, yl = y;
93         cent rl = xl * yl;
94         return cast(cent)(rl >> 64);
95     }
96     else {
97         //full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
98         const uint32_t mask = 0xFFFFFFFF;
99         const uint32_t x0 = cast(uint32_t)(x & mask), y0 = cast(uint32_t)(y & mask);
100         const int32_t x1 = cast(int32_t)(x >> 32), y1 = cast(int32_t)(y >> 32);
101         const uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0);
102         const int64_t t = x1*cast(int64_t)y0 + x0y0_hi;
103         const int64_t w1 = x0*cast(int64_t)y1 + (t & mask);
104         return x1*cast(int64_t)y1 + (t >> 32) + (w1 >> 32);
105     }
106 }
107 
108 static int32_t libdivide__mullhi_s32(int32_t x, int32_t y) {
109     int64_t xl = x, yl = y;
110     int64_t rl = xl * yl;
111     return cast(int32_t)(rl >> 32); //needs to be arithmetic shift
112 }
113 
114 static int32_t libdivide__count_leading_zeros64(uint64_t val) {
115     /* Fast way to count leading zeros */
116     //return __builtin_clzll(val);
117     /* Dorky way to count leading zeros.  Note that this hangs for val = 0! */
118     int32_t result = 0;
119     while (! (val & (1UL << 63))) {
120         val <<= 1;
121         result++;
122     }
123     return result;
124 }
125 
126 static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
127     const uint64_t b = (1UL << 32); // Number base (16 bits).
128     uint64_t un1, un0,        // Norm. dividend LSD's.
129     vn1, vn0,        // Norm. divisor digits.
130     q1, q0,          // Quotient digits.
131     un64, un21, un10,// Dividend digit pairs.
132     rhat;            // A remainder.
133     int s;                  // Shift amount for norm.
134 
135     if (u1 >= v) {            // If overflow, set rem.
136         if (r !is null) {        // to an impossible value,
137             *r = cast(uint64_t)(-1);    // and return the largest
138         }
139         else {
140             return cast(uint64_t)(-1);    // possible quotient.
141         }
142     }
143 
144     /* count leading zeros */
145     s = libdivide__count_leading_zeros64(v); // 0 <= s <= 63.
146     if (s > 0) {
147         v = v << s;           // Normalize divisor.
148         un64 = (u1 << s) | ((u0 >> (64 - s)) & (-s >> 31));
149         un10 = u0 << s;       // Shift dividend left.
150     }
151     else {
152         // Avoid undefined behavior.
153         un64 = u1 | u0;
154         un10 = u0;
155     }
156 
157     vn1 = v >> 32;            // Break divisor up into
158     vn0 = v & 0xFFFFFFFF;     // two 32-bit digits.
159 
160     un1 = un10 >> 32;         // Break right half of
161     un0 = un10 & 0xFFFFFFFF;  // dividend into two digits.
162 
163     q1 = un64/vn1;            // Compute the first
164     rhat = un64 - q1*vn1;     // quotient digit, q1.
165 again1:
166     if (q1 >= b || q1*vn0 > b*rhat + un1) {
167         q1 = q1 - 1;
168         rhat = rhat + vn1;
169         if (rhat < b) goto again1;
170     }
171 
172     un21 = un64*b + un1 - q1*v;  // Multiply and subtract.
173 
174     q0 = un21/vn1;            // Compute the second
175     rhat = un21 - q0*vn1;     // quotient digit, q0.
176 again2:
177     if (q0 >= b || q0*vn0 > b*rhat + un0) {
178         q0 = q0 - 1;
179         rhat = rhat + vn1;
180         if (rhat < b) goto again2;
181     }
182 
183     if (r !is null) {           // If remainder is wanted,
184         *r = (un21*b + un0 - q0*v) >> s;     // return it.
185     }
186     return q1*b + q0;
187 }
188 
189 static uint64_t libdivide__mullhi_u64(uint64_t x, uint64_t y) {
190     static if (is(ucent)) {
191         ucent xl = x, yl = y;
192         ucent rl = xl * yl;
193         return cast(ucent)(rl >> 64);
194     }
195     else {
196         //full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
197         const uint32_t mask = 0xFFFFFFFF;
198         const uint32_t x0 = cast(uint32_t)(x & mask), x1 = cast(uint32_t)(x >> 32);
199         const uint32_t y0 = cast(uint32_t)(y & mask), y1 = cast(uint32_t)(y >> 32);
200         const uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0);
201         const uint64_t x0y1 = x0 * cast(uint64_t)y1;
202         const uint64_t x1y0 = x1 * cast(uint64_t)y0;
203         const uint64_t x1y1 = x1 * cast(uint64_t)y1;
204 
205         uint64_t temp = x1y0 + x0y0_hi;
206         uint64_t temp_lo = temp & mask, temp_hi = temp >> 32;
207         return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32);
208     }
209 }
210 
211 public:
212 
213 struct S32Denominator {
214     alias Type = typeof(magic);
215     int32_t magic;
216     uint8_t more;
217 
218     this(int32_t d) {
219         assert (d > 0, "d<=0");
220 
221         // If d is a power of 2, or negative a power of 2, we have to use a shift.  This is especially
222         // important because the magic algorithm fails for -1.  To check if d is a power of 2 or its inverse,
223         // it suffices to check whether its absolute value has exactly one bit set.  This works even for INT_MIN,
224         // because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set and is a power of 2.
225         uint32_t absD = cast(uint32_t)(d < 0 ? -d : d); //gcc optimizes this to the fast abs trick
226         if ((absD & (absD - 1)) == 0) { //check if exactly one bit is set, don't care if absD is 0 since that's divide by zero
227             this.magic = 0;
228             this.more = cast(uint8_t)(libdivide__count_trailing_zeros32(absD) | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0) | LIBDIVIDE_S32_SHIFT_PATH);
229         }
230         else {
231             const uint32_t floor_log_2_d = cast(uint8_t)(31 - libdivide__count_leading_zeros32(absD));
232             assert(floor_log_2_d >= 1);
233 
234             uint8_t more;
235             //the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word is 0 and the high word is floor_log_2_d - 1
236             uint32_t rem, proposed_m;
237             proposed_m = libdivide_64_div_32_to_32(1U << (floor_log_2_d - 1), 0, absD, &rem);
238             const uint32_t e = absD - rem;
239 
240             /* We are going to start with a power of floor_log_2_d - 1.  This works if works if e < 2**floor_log_2_d. */
241             if (e < (1U << floor_log_2_d)) {
242                 /* This power works */
243                 more = cast(uint8_t)(floor_log_2_d - 1);
244             }
245             else {
246                 // We need to go one higher.  This should not make proposed_m overflow, but it will make it negative
247                 // when interpreted as an int32_t.
248                 proposed_m += proposed_m;
249                 const uint32_t twice_rem = rem + rem;
250                 if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
251                 more = cast(uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); //use the general algorithm
252             }
253             proposed_m += 1;
254             this.magic = (d < 0 ? -cast(int32_t)proposed_m : cast(int32_t)proposed_m);
255             this.more = more;
256         }
257     }
258 
259     int32_t opBinaryRight(string op: "/")(int32_t numer) {
260         if (more & LIBDIVIDE_S32_SHIFT_PATH) {
261             uint8_t shifter = more & LIBDIVIDE_32_SHIFT_MASK;
262             int32_t q = numer + ((numer >> 31) & ((1 << shifter) - 1));
263             q = q >> shifter;
264             int32_t shiftMask = cast(int8_t)(more >> 7); //must be arithmetic shift and then sign-extend
265             q = (q ^ shiftMask) - shiftMask;
266             return q;
267         }
268         else {
269             int32_t q = libdivide__mullhi_s32(magic, numer);
270             if (more & LIBDIVIDE_ADD_MARKER) {
271                 int32_t sign = cast(int8_t)(more >> 7); //must be arithmetic shift and then sign extend
272                 q += ((numer ^ sign) - sign);
273             }
274             q >>= more & LIBDIVIDE_32_SHIFT_MASK;
275             q += (q < 0);
276             return q;
277         }
278     }
279 }
280 
281 struct U32Denominator {
282     alias Type = typeof(magic);
283     uint32_t magic;
284     uint8_t more;
285 
286     this(uint32_t d) {
287         assert (d > 0, "d==0");
288         if ((d & (d - 1)) == 0) {
289             this.magic = 0;
290             this.more = cast(uint8_t)(libdivide__count_trailing_zeros32(d) | LIBDIVIDE_U32_SHIFT_PATH);
291         }
292         else {
293             const uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(d);
294 
295             uint8_t more;
296             uint32_t rem, proposed_m;
297             proposed_m = libdivide_64_div_32_to_32(1U << floor_log_2_d, 0, d, &rem);
298 
299             assert(rem > 0 && rem < d);
300             const uint32_t e = d - rem;
301 
302             /* This power works if e < 2**floor_log_2_d. */
303             if (e < (1U << floor_log_2_d)) {
304                 /* This power works */
305                 more = cast(uint8_t)floor_log_2_d;
306             }
307             else {
308                 // We have to use the general 33-bit algorithm.  We need to compute (2**power) / d.
309                 // However, we already have (2**(power-1))/d and its remainder.  By doubling both, and then
310                 // correcting the remainder, we can compute the larger division. */
311                 proposed_m += proposed_m; //don't care about overflow here - in fact, we expect it
312                 const uint32_t twice_rem = rem + rem;
313                 if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
314                 more = cast(uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
315             }
316             this.magic = 1 + proposed_m;
317             this.more = more;
318             //result.more's shift should in general be ceil_log_2_d.  But if we used the smaller power, we
319             //subtract one from the shift because we're using the smaller power. If we're using the larger power,
320             //we subtract one from the shift because it's taken care of by the add indicator.
321             //So floor_log_2_d happens to be correct in both cases.
322         }
323     }
324 
325     uint32_t opBinaryRight(string op: "/")(uint32_t numer) {
326         if (more & LIBDIVIDE_U32_SHIFT_PATH) {
327             return numer >> (more & LIBDIVIDE_32_SHIFT_MASK);
328         }
329         else {
330             uint32_t q = libdivide__mullhi_u32(magic, numer);
331             if (more & LIBDIVIDE_ADD_MARKER) {
332                 uint32_t t = ((numer - q) >> 1) + q;
333                 return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
334             }
335             else {
336                 return q >> more; //all upper bits are 0 - don't need to mask them off
337             }
338         }
339     }
340 }
341 
342 struct S64Denominator {
343     alias Type = typeof(magic);
344     int64_t magic;
345     uint8_t more;
346 
347     this(int64_t d) {
348         assert (d > 0, "d<=0");
349         // If d is a power of 2, or negative a power of 2, we have to use a shift.  This is especially important
350         // because the magic algorithm fails for -1.  To check if d is a power of 2 or its inverse, it suffices
351         // to check whether its absolute value has exactly one bit set.  This works even for INT_MIN,
352         // because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set and is a power of 2.
353         const uint64_t absD = cast(uint64_t)(d < 0 ? -d : d); //gcc optimizes this to the fast abs trick
354         if ((absD & (absD - 1)) == 0) { //check if exactly one bit is set, don't care if absD is 0 since that's divide by zero
355             this.more = cast(ubyte)(libdivide__count_trailing_zeros64(absD) | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
356             this.magic = 0;
357         }
358         else {
359             const uint32_t floor_log_2_d = cast(uint32_t)(63 - libdivide__count_leading_zeros64(absD));
360 
361             //the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word is 0 and the high word is floor_log_2_d - 1
362             uint8_t more;
363             uint64_t rem, proposed_m;
364             proposed_m = libdivide_128_div_64_to_64(1UL << (floor_log_2_d - 1), 0, absD, &rem);
365             const uint64_t e = absD - rem;
366 
367             /* We are going to start with a power of floor_log_2_d - 1.  This works if works if e < 2**floor_log_2_d. */
368             if (e < (1UL << floor_log_2_d)) {
369                 /* This power works */
370                 more = cast(ubyte)(floor_log_2_d - 1);
371             }
372             else {
373                 // We need to go one higher.  This should not make proposed_m overflow, but it will make it
374                 // negative when interpreted as an int32_t.
375                 proposed_m += proposed_m;
376                 const uint64_t twice_rem = rem + rem;
377                 if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
378                 more = cast(ubyte)(floor_log_2_d | LIBDIVIDE_ADD_MARKER | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
379             }
380             proposed_m += 1;
381             this.more = more;
382             this.magic = (d < 0 ? -cast(int64_t)proposed_m : cast(int64_t)proposed_m);
383         }
384     }
385 
386     int64_t opBinaryRight(string op: "/")(int64_t numer) {
387         if (magic == 0) { //shift path
388             uint32_t shifter = more & LIBDIVIDE_64_SHIFT_MASK;
389             int64_t q = numer + ((numer >> 63) & ((1L << shifter) - 1));
390             q = q >> shifter;
391             int64_t shiftMask = cast(int8_t)(more >> 7); //must be arithmetic shift and then sign-extend
392             q = (q ^ shiftMask) - shiftMask;
393             return q;
394         }
395         else {
396             int64_t q = libdivide__mullhi_s64(magic, numer);
397             if (more & LIBDIVIDE_ADD_MARKER) {
398                 int64_t sign = cast(int8_t)(more >> 7); //must be arithmetic shift and then sign extend
399                 q += ((numer ^ sign) - sign);
400             }
401             q >>= more & LIBDIVIDE_64_SHIFT_MASK;
402             q += (q < 0);
403             return q;
404         }
405     }
406 }
407 
408 struct U64Denominator {
409     alias Type = typeof(magic);
410     uint64_t magic;
411     uint8_t more;
412 
413     this(uint64_t d) {
414         assert (d > 0, "d==0");
415         if ((d & (d - 1)) == 0) {
416             this.more = cast(uint8_t)(libdivide__count_trailing_zeros64(d) | LIBDIVIDE_U64_SHIFT_PATH);
417             this.magic = 0;
418         }
419         else {
420             const uint32_t floor_log_2_d = 63 - libdivide__count_leading_zeros64(d);
421 
422             uint64_t proposed_m, rem;
423             uint8_t more;
424             proposed_m = libdivide_128_div_64_to_64(1UL << floor_log_2_d, 0, d, &rem); //== (1 << (64 + floor_log_2_d)) / d
425 
426             assert(rem > 0 && rem < d);
427             const uint64_t e = d - rem;
428 
429             /* This power works if e < 2**floor_log_2_d. */
430             if (e < (1UL << floor_log_2_d)) {
431                 /* This power works */
432                 more = cast(uint8_t)floor_log_2_d;
433             }
434             else {
435                 // We have to use the general 65-bit algorithm.  We need to compute (2**power) / d. However,
436                 // we already have (2**(power-1))/d and its remainder.  By doubling both, and then correcting
437                 // the remainder, we can compute the larger division.
438                 proposed_m += proposed_m; //don't care about overflow here - in fact, we expect it
439                 const uint64_t twice_rem = rem + rem;
440                 if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
441                 more = cast(uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
442             }
443             this.magic = 1 + proposed_m;
444             this.more = more;
445             //result.more's shift should in general be ceil_log_2_d.  But if we used the smaller power, we subtract
446             //one from the shift because we're using the smaller power. If we're using the larger power, we subtract
447             //one from the shift because it's taken care of by the add indicator.  So floor_log_2_d happens to be
448             //correct in both cases, which is why we do it outside of the if statement.
449         }
450     }
451 
452     uint64_t opBinaryRight(string op: "/")(uint64_t numer) {
453         if (more & LIBDIVIDE_U64_SHIFT_PATH) {
454             return numer >> (more & LIBDIVIDE_64_SHIFT_MASK);
455         }
456         else {
457             uint64_t q = libdivide__mullhi_u64(magic, numer);
458             if (more & LIBDIVIDE_ADD_MARKER) {
459                 uint64_t t = ((numer - q) >> 1) + q;
460                 return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
461             }
462             else {
463                 return q >> more; //all upper bits are 0 - don't need to mask them off
464             }
465         }
466     }
467 }
468 
469 auto denominator(T)(T value) {
470     static if (is(T == uint32_t)) {
471         return U32Denominator(value);
472     }
473     else static if (is(T == int32_t)) {
474         return S32Denominator(value);
475     }
476     else static if (is(T == uint64_t)) {
477         return U64Denominator(value);
478     }
479     else static if (is(T == int64_t)) {
480         return S64Denominator(value);
481     }
482     else {
483         static assert (false, "T must be an int, uint, long, ulong, not " ~ T.stringof);
484     }
485 }
486 
487 
488 unittest {
489     static assert (1000 / U32Denominator(31) == 32);
490     assert (1000 / U32Denominator(31) == 32);
491     assert (1000 / S32Denominator(50) == 20);
492     assert (1000 / denominator(11) == 90);
493 }
494 
495 unittest {
496     import std.random;
497     import std..string;
498     import std.typetuple;
499 
500     int counter;
501     alias denominators = TypeTuple!(S32Denominator, U32Denominator, S64Denominator, U64Denominator);
502     enum tests = 5000;
503 
504     foreach(D; denominators) {
505         foreach(j; 0 .. tests) {
506             D.Type d = uniform(1, D.Type.max);
507             D.Type n = uniform(D.Type.min, D.Type.max);
508             D.Type q = n / d;
509 
510             D d2 = D(d);
511             D.Type q2 = n / d2;
512             assert (q == q2, "%s/%s = %s, not %s".format(n, d, q, q2));
513             counter++;
514         }
515     }
516     assert(counter == denominators.length * tests, "counter=%s".format(counter));
517 }
518 
519 //void main(){}