diff --git a/main.cpp b/main.cpp
index 0dae627..4fbc28b 100644
--- a/main.cpp
+++ b/main.cpp
@@ -23,7 +23,7 @@
 //#define NO_OPENCL
 
 // If defined, use double floating-point instead of fixed-point.
-#define USE_DOUBLE
+//#define USE_DOUBLE
 
 #include <atomic>
 #include <chrono>
diff --git a/opencl/mandelbrot_calc_r128.c b/opencl/mandelbrot_calc_r128.c
index 35df1e2..186a53c 100644
--- a/opencl/mandelbrot_calc_r128.c
+++ b/opencl/mandelbrot_calc_r128.c
@@ -36,21 +36,23 @@ inline ulong2 r128__umul128(ulong a, ulong b)
     return dst;
 }
 
-inline ulong r128__umul128Hi(ulong a, ulong b)
+inline ulong2 r128__umul128S(ulong a)
 {
     ulong alo = a & 0xFFFFFFFF;
     ulong ahi = a >> 32;
-    ulong blo = b & 0xFFFFFFFF;
-    ulong bhi = b >> 32;
-    ulong p0, p1, p2, p3;
+    ulong p0, p1, p3;
 
-    p0 = alo * blo;
-    p1 = alo * bhi;
-    p2 = ahi * blo;
-    p3 = ahi * bhi;
+    p0 = alo * alo;
+    p1 = alo * ahi;
+    p3 = ahi * ahi;
 
-    ulong carry = ((p1 & 0xFFFFFFFF) + (p2 & 0xFFFFFFFF) + (p0 >> 32)) >> 32;
-    return p3 + ((uint)(p1 >> 32) + (uint)(p2 >> 32)) + carry;
+    ulong2 dst;
+    ulong carry;
+    carry = ((p1 & 0xFFFFFFFF) * 2 + (p0 >> 32)) >> 32;
+
+    dst.lo = p0 + (p1 << 33);
+    dst.hi = p3 + (uint)(p1 >> 31) + carry;
+    return dst;
 }
 
 inline ulong2 r128Shr(const ulong2 src, int amount)
@@ -63,17 +65,16 @@ inline ulong2 r128Shr(const ulong2 src, int amount)
 
 inline ulong2 r128__umul(const ulong2 a, const ulong2 b)
 {
-   ulong2 ahbl, albh, ahbh, sum;
+   ulong2 ahbl, ahbh, sum;
 
-   sum.lo = r128__umul128Hi(a.lo, b.lo);
+   sum.lo = r128__umul128(a.lo, b.lo).hi;
    ahbl = r128__umul128(a.hi, b.lo);
-   albh = r128__umul128(a.lo, b.hi);
    ahbh = r128__umul128(a.hi, b.hi);
 
    ahbh = r128Shr(ahbh, 60);
    sum.hi = ahbh.lo;
    sum = r128Add(sum, ahbl);
-   sum = r128Add(sum, albh);
+   sum = r128Add(sum, ahbl);
 
    return sum;
 }
@@ -87,41 +88,48 @@ inline ulong2 r128Mul(const ulong2 a, const ulong2 b)
    tb = b;
 
    if ((long)ta.hi < 0) {
-      if (ta.lo) {
-         ta.lo = ~ta.lo + 1;
-         ta.hi = ~ta.hi;
-      } else {
-         ta.lo = 0;
-         ta.hi = ~ta.hi + 1;
-      }
+      ta.lo = ~ta.lo + 1;
+      ta.hi = ~ta.hi;
       sign = !sign;
    }
    if ((long)tb.hi < 0) {
-      if (tb.lo) {
-         tb.lo = ~tb.lo + 1;
-         tb.hi = ~tb.hi;
-      } else {
-         tb.lo = 0;
-         tb.hi = ~tb.hi + 1;
-      }
+      tb.lo = ~tb.lo + 1;
+      tb.hi = ~tb.hi;
       sign = !sign;
    }
 
    tc = r128__umul(ta, tb);
 
    if (sign) {
-      if (tc.lo) {
-         tc.lo = ~tc.lo + 1;
-         tc.hi = ~tc.hi;
-      } else {
-         tc.lo = 0;
-         tc.hi = ~tc.hi + 1;
-      }
+      tc.lo = ~tc.lo + 1;
+      tc.hi = ~tc.hi;
    }
 
    return tc;
 }
 
+inline ulong2 r128Square(const ulong2 a)
+{
+   ulong2 ta = a;
+
+   if ((long)ta.hi < 0) {
+      ta.lo = ~ta.lo + 1;
+      ta.hi = ~ta.hi;
+   }
+
+   ulong2 ahbl, ahbh, sum;
+
+   sum.lo = r128__umul128S(ta.lo).hi;
+   ahbl = r128__umul128(ta.hi, ta.lo);
+   ahbh = r128__umul128S(ta.hi);
+
+   ahbh = r128Shr(ahbh, 60);
+   sum.hi = ahbh.lo;
+   sum = r128Add(sum, ahbl);
+   sum = r128Add(sum, ahbl);
+   return sum;
+}
+
 __kernel void mandelbrot_calc(const __global ulong4 *c_pt,
                               __global unsigned int *out_it,
                               const unsigned int max_iterations)
@@ -135,8 +143,8 @@ __kernel void mandelbrot_calc(const __global ulong4 *c_pt,
 
     for (iterations = 0; iterations < max_iterations; ++iterations) {
         tmp = r128Mul(pt.lo, pt.hi);
-        pt.lo = r128Mul(pt.lo, pt.lo);
-        pt.hi = r128Mul(pt.hi, pt.hi);
+        pt.lo = r128Square(pt.lo);
+        pt.hi = r128Square(pt.hi);
 
         const ulong2 sum = r128Add(pt.lo, pt.hi);
         if ((long)sum.hi >= 0x4000000000000000)