diff --git a/main.cpp b/main.cpp index 0dae627..4fbc28b 100644 --- a/main.cpp +++ b/main.cpp @@ -23,7 +23,7 @@ //#define NO_OPENCL // If defined, use double floating-point instead of fixed-point. -#define USE_DOUBLE +//#define USE_DOUBLE #include #include diff --git a/opencl/mandelbrot_calc_r128.c b/opencl/mandelbrot_calc_r128.c index 35df1e2..186a53c 100644 --- a/opencl/mandelbrot_calc_r128.c +++ b/opencl/mandelbrot_calc_r128.c @@ -36,21 +36,23 @@ inline ulong2 r128__umul128(ulong a, ulong b) return dst; } -inline ulong r128__umul128Hi(ulong a, ulong b) +inline ulong2 r128__umul128S(ulong a) { ulong alo = a & 0xFFFFFFFF; ulong ahi = a >> 32; - ulong blo = b & 0xFFFFFFFF; - ulong bhi = b >> 32; - ulong p0, p1, p2, p3; + ulong p0, p1, p3; - p0 = alo * blo; - p1 = alo * bhi; - p2 = ahi * blo; - p3 = ahi * bhi; + p0 = alo * alo; + p1 = alo * ahi; + p3 = ahi * ahi; - ulong carry = ((p1 & 0xFFFFFFFF) + (p2 & 0xFFFFFFFF) + (p0 >> 32)) >> 32; - return p3 + ((uint)(p1 >> 32) + (uint)(p2 >> 32)) + carry; + ulong2 dst; + ulong carry; + carry = ((p1 & 0xFFFFFFFF) * 2 + (p0 >> 32)) >> 32; + + dst.lo = p0 + (p1 << 33); + dst.hi = p3 + (uint)(p1 >> 31) + carry; + return dst; } inline ulong2 r128Shr(const ulong2 src, int amount) @@ -63,17 +65,16 @@ inline ulong2 r128Shr(const ulong2 src, int amount) inline ulong2 r128__umul(const ulong2 a, const ulong2 b) { - ulong2 ahbl, albh, ahbh, sum; + ulong2 ahbl, ahbh, sum; - sum.lo = r128__umul128Hi(a.lo, b.lo); + sum.lo = r128__umul128(a.lo, b.lo).hi; ahbl = r128__umul128(a.hi, b.lo); - albh = r128__umul128(a.lo, b.hi); ahbh = r128__umul128(a.hi, b.hi); ahbh = r128Shr(ahbh, 60); sum.hi = ahbh.lo; sum = r128Add(sum, ahbl); - sum = r128Add(sum, albh); + sum = r128Add(sum, ahbl); return sum; } @@ -87,41 +88,48 @@ inline ulong2 r128Mul(const ulong2 a, const ulong2 b) tb = b; if ((long)ta.hi < 0) { - if (ta.lo) { - ta.lo = ~ta.lo + 1; - ta.hi = ~ta.hi; - } else { - ta.lo = 0; - ta.hi = ~ta.hi + 1; - } + ta.lo = ~ta.lo + 1; + ta.hi = ~ta.hi; sign = !sign; } if ((long)tb.hi < 0) { - if (tb.lo) { - tb.lo = ~tb.lo + 1; - tb.hi = ~tb.hi; - } else { - tb.lo = 0; - tb.hi = ~tb.hi + 1; - } + tb.lo = ~tb.lo + 1; + tb.hi = ~tb.hi; sign = !sign; } tc = r128__umul(ta, tb); if (sign) { - if (tc.lo) { - tc.lo = ~tc.lo + 1; - tc.hi = ~tc.hi; - } else { - tc.lo = 0; - tc.hi = ~tc.hi + 1; - } + tc.lo = ~tc.lo + 1; + tc.hi = ~tc.hi; } return tc; } +inline ulong2 r128Square(const ulong2 a) +{ + ulong2 ta = a; + + if ((long)ta.hi < 0) { + ta.lo = ~ta.lo + 1; + ta.hi = ~ta.hi; + } + + ulong2 ahbl, ahbh, sum; + + sum.lo = r128__umul128S(ta.lo).hi; + ahbl = r128__umul128(ta.hi, ta.lo); + ahbh = r128__umul128S(ta.hi); + + ahbh = r128Shr(ahbh, 60); + sum.hi = ahbh.lo; + sum = r128Add(sum, ahbl); + sum = r128Add(sum, ahbl); + return sum; +} + __kernel void mandelbrot_calc(const __global ulong4 *c_pt, __global unsigned int *out_it, const unsigned int max_iterations) @@ -135,8 +143,8 @@ __kernel void mandelbrot_calc(const __global ulong4 *c_pt, for (iterations = 0; iterations < max_iterations; ++iterations) { tmp = r128Mul(pt.lo, pt.hi); - pt.lo = r128Mul(pt.lo, pt.lo); - pt.hi = r128Mul(pt.hi, pt.hi); + pt.lo = r128Square(pt.lo); + pt.hi = r128Square(pt.hi); const ulong2 sum = r128Add(pt.lo, pt.hi); if ((long)sum.hi >= 0x4000000000000000)