From e9d329fe688230b46792088ac42fd04fc8cccddb Mon Sep 17 00:00:00 2001
From: Clyne Sullivan <clyne@bitgloo.com>
Date: Sat, 23 Jul 2022 20:38:35 -0400
Subject: [PATCH] add USE_DOUBLE option; remove duplicate files

---
 main.cpp        |   47 +-
 opencl/Makefile |    2 -
 opencl/main.cpp |  494 -----------
 opencl/r128.h   | 2210 -----------------------------------------------
 4 files changed, 39 insertions(+), 2714 deletions(-)
 delete mode 100644 opencl/Makefile
 delete mode 100644 opencl/main.cpp
 delete mode 100644 opencl/r128.h
diff --git a/main.cpp b/main.cpp
index 9ad0ffc..0dae627 100644
--- a/main.cpp
+++ b/main.cpp
@@ -22,6 +22,9 @@
 // If defined, split calculations across CPU threads instead of using OpenCL.
 //#define NO_OPENCL
 
+// If defined, use double floating-point instead of fixed-point.
+#define USE_DOUBLE
+
 #include <atomic>
 #include <chrono>
 #include <cstdint>
@@ -39,6 +42,15 @@
 
 #include <SDL2/SDL.h>
 
+// Sets the window's dimensions. The window is square.
+constexpr static int WIN_DIM = 800;
+
+#ifdef USE_DOUBLE
+#define FRACTAL_KERNEL "opencl/mandelbrot_calc.c"
+#else
+#define FRACTAL_KERNEL "opencl/mandelbrot_calc_r128.c"
+#endif
+
 #ifndef NO_OPENCL
 // Include OpenCL libraries if they're required.
 #define CL_HPP_TARGET_OPENCL_VERSION (300)
@@ -48,6 +60,21 @@
 // Define helper types and functions to allow direct inclusion of the kernel.
 #include <map>
 
+struct double2 {
+    double x;
+    double y;
+
+    auto& operator+=(const double2& other) {
+        x += other.x;
+        y += other.y;
+        return *this;
+    }
+    auto& operator*=(const double2& other) {
+        x *= other.x;
+        y *= other.y;
+        return *this;
+    }
+} __attribute__ ((packed));
 struct ulong2 {
     uint64_t lo;
     uint64_t hi;
@@ -60,16 +87,18 @@ struct ulong4 {
 #define __kernel
 #define __global
 #define get_global_id(x) (globalIds[std::this_thread::get_id()])
+#ifdef USE_DOUBLE
+#define CALC_SRC_T double2
+#else
+#define CALC_SRC_T ulong4
+#endif
 
 static std::map<std::thread::id, unsigned int> globalIds;
 static std::array<uint32_t, WIN_DIM * WIN_DIM> renderOutput;
 
-#include "opencl/mandelbrot_calc_r128.c"
+#include FRACTAL_KERNEL
 #endif
 
-// Sets the window's dimensions. The window is square.
-constexpr static int WIN_DIM = 800;
-
 // For non-OpenCL rendering, the number of threads to split work across.
 constexpr static int THREAD_COUNT = 8;
 
@@ -77,11 +106,13 @@ constexpr static int THREAD_COUNT = 8;
 // Can use native float or double; or, a custom Q4.124 fixed-point data type.
 // If fixed point, use "*_r128.c" OpenCL kernel. Otherwise, use regular kernel.
 
+#ifdef USE_DOUBLE
+using Float = double;
+#else
 #define R128_IMPLEMENTATION
 #include "r128.h"
 using Float = R128;
-
-//using Float = double;
+#endif
 
 // Not allowed to calculate less iterations than this.
 constexpr uint32_t MIN_MAX_ITERATIONS = 70;
@@ -169,7 +200,7 @@ int main(int argc, char **argv)
     initSDL(&window, &renderer, &MandelbrotTexture);
 
 #ifndef NO_OPENCL
-    std::ifstream clSource ("opencl/mandelbrot_calc_r128.c");
+    std::ifstream clSource (FRACTAL_KERNEL);
     if (!clSource.good())
         throw std::runtime_error("Failed to open OpenCL kernel!");
 
@@ -557,7 +588,7 @@ void MandelbrotState::calculateBitmap()
         execs.emplace_back([this, t] {
             for (size_t i = t * renderOutput.size() / THREAD_COUNT; i < (t + 1) * renderOutput.size() / THREAD_COUNT; ++i) {
                 globalIds.insert_or_assign(std::this_thread::get_id(), i);
-                mandelbrot_calc((ulong4 *)points.data(), renderOutput.data(), m_max_iterations);
+                mandelbrot_calc((CALC_SRC_T *)points.data(), renderOutput.data(), m_max_iterations);
             }});
     }
 
diff --git a/opencl/Makefile b/opencl/Makefile
deleted file mode 100644
index 254df05..0000000
--- a/opencl/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-all: main.cpp
-	g++ main.cpp -std=c++20 -lSDL2 -lpthread -lOpenCL -g3 -ggdb -O0
diff --git a/opencl/main.cpp b/opencl/main.cpp
deleted file mode 100644
index 185960b..0000000
--- a/opencl/main.cpp
+++ /dev/null
@@ -1,494 +0,0 @@
-// fractal - OpenCL-accelerated Mandelbrot renderer.
-// Written by Clyne Sullivan.
-
-// If defined, program auto-zooms and measures runtime.
-//#define BENCHMARK
-
-#include <atomic>
-#include <chrono>
-#include <cstdint>
-#include <cstring>
-#include <fstream>
-#include <iomanip>
-#include <iostream>
-#include <memory>
-#include <sstream>
-#include <stdexcept>
-#include <thread>
-#include <vector>
-
-#define CL_HPP_TARGET_OPENCL_VERSION (300)
-#define CL_HPP_ENABLE_EXCEPTIONS (1)
-#include <CL/opencl.hpp>
-#include <SDL2/SDL.h>
-
-// The "Float" type determines what data type will store numbers for calculations.
-// Can use native float or double; or, a custom Q4.124 fixed-point data type.
-// If fixed point, use "*_r128.c" OpenCL kernel. Otherwise, use regular kernel.
-
-#define R128_IMPLEMENTATION
-#include "r128.h"
-using Float = R128;
-
-//using Float = double;
-
-// Sets the window's dimensions. The window is square.
-constexpr static int WIN_DIM = 800;
-
-// Not allowed to calculate less iterations than this.
-constexpr uint32_t MIN_MAX_ITERATIONS = 70;
-// Not allowed to zoom out farther than this.
-static const Float MIN_ZOOM (4.0);
-
-/**
- * A packed Float-pair for storing complex numbers.
- * Must match a vector type for OpenCL.
- */
-struct Complex {
-    Float real = Float(0);
-    Float imag = Float(0);
-} __attribute__ ((packed));
-
-class MandelbrotState
-{
-public:
-    // Initializes data and spawns the calculation thread.
-    MandelbrotState();
-    // Joins threads.
-    ~MandelbrotState();
-
-    // Prepares to use the given OpenCL kernel for calculations.
-    void initKernel(cl::Context& clcontext, cl::Program& clprogram, const char *kernelname);
-
-    Float zoom() const;
-
-    // Offsets the view's origin by the given Complex, and changes zoom by the given factor.
-    // Returns true if a new calculation has been scheduled (false if one is in progress).
-    bool moveOriginAndZoomBy(Complex c, Float z);
-
-    // Outputs the results of the latest calculation into the given SDL texture.
-    // Returns true if successful.
-    // Returns false if a calculation is in progress. The texture is not updated.
-    bool intoTexture(SDL_Texture *texture);
-    // Requests the initiation of a new calculation.
-    void scheduleRecalculation();
-
-private:
-    std::thread m_calc_thread;
-    std::atomic_bool m_calcing; // If false, we're ready for a new calculation.
-    std::atomic_flag m_recalc;  // Tell calcThread to recalc, or tell main thread recalcing is done.
-    uint32_t m_max_iterations;
-    Float m_zoom;
-    Complex m_origin;
-
-    std::unique_ptr<cl::Kernel> m_cl_kernel;
-    std::unique_ptr<cl::CommandQueue> m_cl_queue;
-    std::unique_ptr<cl::Buffer> m_cl_input;
-    std::unique_ptr<cl::Buffer> m_cl_output;
-
-    // Enters main loop of calcThread.
-    void calcThread();
-    // Calls the OpenCL kernel to compute new results.
-    void calculateBitmap();
-
-    // Determine the max iteration count based on zoom factor.
-    static uint32_t calculateMaxIterations(Float zoom);
-};
-
-static bool done = false;
-static std::atomic_int fps = 0;
-static std::chrono::time_point<std::chrono::high_resolution_clock> clTime;
-
-static cl::Context initCLContext();
-static cl::Program initCLProgram(cl::Context&, const char * const);
-static void initSDL(SDL_Window **, SDL_Renderer **, SDL_Texture **);
-static void threadFpsMonitor(MandelbrotState&);
-static void threadEventMonitor(MandelbrotState&);
-
-int main(int argc, char **argv)
-{
-    MandelbrotState Mandelbrot;
-    SDL_Window *window;
-    SDL_Renderer *renderer;
-    SDL_Texture *MandelbrotTexture;
-
-    initSDL(&window, &renderer, &MandelbrotTexture);
-
-    std::ifstream clSource ("opencl/mandelbrot_calc_r128.c");
-    if (!clSource.good())
-        throw std::runtime_error("Failed to open OpenCL kernel!");
-
-    // Dump OpenCL kernel into a std::string.
-    std::ostringstream oss;
-    oss << clSource.rdbuf();
-    std::string clSourceStr (oss.str());
-
-    auto clContext = initCLContext();
-    auto clProgram = initCLProgram(clContext, clSourceStr.data());
-    Mandelbrot.initKernel(clContext, clProgram, "mandelbrot_calc");
-
-    // Initiate first calculation so something appears on the screen.
-    Mandelbrot.scheduleRecalculation();
-
-    std::thread fpsMonitor ([&Mandelbrot] { threadFpsMonitor(Mandelbrot); });
-    std::thread eventMonitor ([&Mandelbrot] { threadEventMonitor(Mandelbrot); });
-
-#ifdef BENCHMARK
-    auto start = std::chrono::high_resolution_clock::now();
-#endif
-
-    while (!done) {
-        if (Mandelbrot.intoTexture(MandelbrotTexture)) {
-            SDL_RenderClear(renderer);
-            SDL_RenderCopy(renderer, MandelbrotTexture, nullptr, nullptr);
-            SDL_RenderPresent(renderer);
-
-            ++fps;
-        } else {
-            std::this_thread::sleep_for(std::chrono::microseconds(10));
-        }
-
-#ifdef BENCHMARK
-        if (Mandelbrot.zoom() < Float(1e-5))
-            done = true;
-#endif
-    }
-
-#ifdef BENCHMARK
-    std::chrono::duration<double> seconds = std::chrono::high_resolution_clock::now() - start;
-    std::cout << "Calculations took: " << seconds.count() << "s" << std::endl;
-#endif
-
-    eventMonitor.join();
-    fpsMonitor.join();
-    SDL_DestroyRenderer(renderer);
-    return 0;
-}
-
-
-static cl::Platform clplatform;
-static std::vector<cl::Device> cldevices;
-
-cl::Context initCLContext()
-{
-    clplatform = cl::Platform::getDefault();
-    clplatform.getDevices(CL_DEVICE_TYPE_GPU, &cldevices);
-
-    return cl::Context(cldevices.front());
-}
-
-cl::Program initCLProgram(cl::Context& clcontext, const char * const source)
-{
-    cl::Program *prog;
-
-    try {
-        prog = new cl::Program(clcontext, source);
-        prog->build();
-        return *prog;
-    } catch (const cl::Error& err) {
-        const auto& dev = cldevices.front();
-        std::cout << "Build Log: " << prog->getBuildInfo<CL_PROGRAM_BUILD_LOG>(dev) << std::endl;
-        throw err;
-    }
-}
-
-void initSDL(SDL_Window **window, SDL_Renderer **renderer, SDL_Texture **texture)
-{
-    /* Enable standard application logging */
-    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
-
-    if (SDL_Init(SDL_INIT_VIDEO) < 0) {
-        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
-        throw std::runtime_error("initSDL failed!");
-    }
-
-    *window = SDL_CreateWindow("Happy Mandelbrot",
-                               SDL_WINDOWPOS_UNDEFINED,
-                               SDL_WINDOWPOS_UNDEFINED,
-                               WIN_DIM, WIN_DIM,
-                               SDL_WINDOW_RESIZABLE);
-    if (!*window) {
-        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't set create window: %s\n", SDL_GetError());
-        throw std::runtime_error("initSDL failed!");
-    }
-
-    *renderer = SDL_CreateRenderer(*window, -1, 0);
-    if (!*renderer) {
-        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't set create renderer: %s\n", SDL_GetError());
-        throw std::runtime_error("initSDL failed!");
-    }
-
-    SDL_GL_SetSwapInterval(0);
-
-    *texture = SDL_CreateTexture(*renderer, SDL_PIXELFORMAT_ARGB8888, SDL_TEXTUREACCESS_STREAMING, WIN_DIM, WIN_DIM);
-    if (!*texture) {
-        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't set create texture: %s\n", SDL_GetError());
-        throw std::runtime_error("initSDL failed!");
-    }
-
-    atexit(SDL_Quit);
-}
-
-void threadFpsMonitor(MandelbrotState& Mandelbrot)
-{
-    while (!done) {
-        std::cout << "Rendered FPS: " << fps.load() << ", Z: " << (double)Mandelbrot.zoom() << std::endl;
-        fps.store(0);
-        std::this_thread::sleep_for(std::chrono::seconds(1));
-    }
-}
-
-void threadEventMonitor(MandelbrotState& Mandelbrot)
-{
-    Float zfactor (1.03);
-    Float zooming (1);
-    Complex newoffset;
-
-    while (!done) {
-        auto next = std::chrono::steady_clock::now() + std::chrono::milliseconds(17);
-
-        for (SDL_Event event; SDL_PollEvent(&event);) {
-            switch (event.type) {
-            case SDL_KEYDOWN:
-                if (event.key.keysym.sym == SDLK_ESCAPE)
-                    done = true;
-                break;
-            case SDL_MOUSEBUTTONDOWN:
-                // Calculate desired "normal" change from origin. -0.5 to 0.5.
-                // Zoom scales this result later.
-                newoffset = Complex {
-                    Float(event.button.x / (double)WIN_DIM) - Float(0.5),
-                    Float(event.button.y / (double)WIN_DIM) - Float(0.5)
-                };
-
-                // Zoom in with left button, zoom out with right.
-                if (event.button.button == SDL_BUTTON_LEFT)
-                    zooming *= Float(1) - (zfactor - Float(1));
-                else if (event.button.button == SDL_BUTTON_RIGHT)
-                    zooming = zfactor;
-                break;
-            case SDL_MOUSEBUTTONUP:
-                // Stop moving.
-                zooming = 1;
-                newoffset.real = 0;
-                newoffset.imag = 0;
-                break;
-            case SDL_MOUSEMOTION:
-                // Update offset on mouse movement, so zoom continues towards where the user expects.
-                if (zooming != Float(1)) {
-                    newoffset.real += Float(event.motion.xrel / (double)WIN_DIM);
-                    newoffset.imag += Float(event.motion.yrel / (double)WIN_DIM);
-                }
-                break;
-            case SDL_MOUSEWHEEL:
-                // Increase zoom factor with scroll up, or decrease with scroll down.
-                zfactor = std::max(Float(1), zfactor + Float(0.005) * Float(event.wheel.y));
-
-                // Update zoom speed if zfactor is changed while zooming.
-                if (zooming != Float(1)) {
-                    if (zooming < Float(1))
-                        zooming *= Float(1) - (zfactor - Float(1));
-                    else
-                        zooming = zfactor;
-                }
-                break;
-            case SDL_QUIT:
-                done = true;
-                break;
-            }
-        }
-
-#ifdef BENCHMARK
-        // Constant zoom on a constant point.
-        bool b = Mandelbrot.moveOriginAndZoomBy({}, Float(1) - (zfactor - Float(1)));
-        std::this_thread::sleep_for(std::chrono::milliseconds(b ? 8 : 1)); // max 125fps
-#else
-        if (zooming != Float(1) || newoffset.real != Float(0) || newoffset.imag != Float(0)) {
-            // Scale new offset according to zoom.
-            const auto zoom = Mandelbrot.zoom();
-            Complex c = newoffset;
-            c.real *= zoom * (Float(1) - zooming);
-            c.imag *= zoom * (Float(1) - zooming);
-
-            // Don't sleep as long if a recalculation is already running, so
-            // we can get our new request in sooner once recalculation completes.
-            if (Mandelbrot.moveOriginAndZoomBy(c, zooming))
-                std::this_thread::sleep_until(next);
-            else
-                std::this_thread::sleep_for(std::chrono::microseconds(50));
-        } else {
-            std::this_thread::sleep_until(next);
-        }
-#endif
-    }
-}
-
-MandelbrotState::MandelbrotState():
-    m_calcing(false),
-    m_max_iterations(MIN_MAX_ITERATIONS),
-    m_zoom(MIN_ZOOM)
-{
-#ifndef BENCHMARK
-    // This is a good starting point.
-    m_origin.real = -1;
-    m_origin.imag = 0;
-#else
-    m_origin.real = -1.5;
-    m_origin.imag = 0;
-#endif
-
-    // Spawn calcThread to begin receiving recalculation requests.
-    m_recalc.clear();
-    m_calc_thread = std::thread([this] { calcThread(); });
-}
-
-MandelbrotState::~MandelbrotState() {
-    // calcThread is likely waiting for m_recalc to become true.
-    m_recalc.test_and_set();
-    m_recalc.notify_all();
-
-    // Bring calcThread in if it's still running.
-    if (m_calc_thread.joinable())
-        m_calc_thread.join();
-}
-
-void MandelbrotState::initKernel(cl::Context& clcontext, cl::Program& clprogram, const char *kernelname)
-{
-    m_cl_kernel.reset(new cl::Kernel(clprogram, "mandelbrot_calc"));
-    m_cl_queue.reset(new cl::CommandQueue(clcontext));
-    m_cl_input.reset(new cl::Buffer(clcontext, CL_MEM_READ_ONLY, WIN_DIM * WIN_DIM * sizeof(Complex)));
-    m_cl_output.reset(new cl::Buffer(clcontext, CL_MEM_WRITE_ONLY, WIN_DIM * WIN_DIM * sizeof(uint32_t)));
-
-    // These kernel parameters do not change throughout execution.
-    // Max iteration count does, and is set with each kernel execution.
-    m_cl_kernel->setArg(0, *m_cl_input);
-    m_cl_kernel->setArg(1, *m_cl_output);
-}
-
-Float MandelbrotState::zoom() const {
-    return m_zoom;
-}
-
-bool MandelbrotState::moveOriginAndZoomBy(Complex c, Float z) {
-    if (!m_calcing) {
-        m_origin.real += c.real;
-        m_origin.imag += c.imag;
-        m_zoom = std::min(MIN_ZOOM, m_zoom * z);
-        m_max_iterations = std::max(MIN_MAX_ITERATIONS, calculateMaxIterations(m_zoom));
-
-        scheduleRecalculation();
-    }
-
-    return !m_calcing;
-}
-
-bool MandelbrotState::intoTexture(SDL_Texture *texture) {
-    if (m_calcing) {
-        // Wait for the calculations to complete.
-        m_recalc.wait(true);
-
-        // Lock the SDL texture, then stream the OpenCL output into it.
-        void *dst;
-        int pitch;
-        SDL_LockTexture(texture, nullptr, &dst, &pitch);
-        m_cl_queue->enqueueReadBuffer(*m_cl_output, CL_TRUE, 0, WIN_DIM * WIN_DIM * sizeof(uint32_t), dst);
-        SDL_UnlockTexture(texture);
-
-        std::chrono::duration<double> diff = 
-            std::chrono::high_resolution_clock::now() - clTime;
-        std::cout << "Time: " << diff.count() << "s" << std::endl;
-
-        // Allow user input to modify origin and zoom,
-        // also allowing the next calculation to be scheduled.
-        m_calcing = false;
-        return true;
-    } else {
-        return false;
-    }
-}
-
-void MandelbrotState::scheduleRecalculation() {
-    if (!m_calcing) {
-        // Tell calcThread that it's time to recalculate.
-        m_recalc.test_and_set();
-        m_recalc.notify_one();
-    }
-}
-
-void MandelbrotState::calcThread() {
-    while (!done) {
-        // Wait for a recalculation to be requested, indicated by m_recalc becoming true.
-        m_recalc.wait(false);
-        calculateBitmap();
-
-        // Finished. Clear m_recalc, and notify the render thread (checked at MandelbrotState::intoTexture).
-        m_recalc.clear();
-        m_recalc.notify_one();
-    }
-}
-
-uint32_t MandelbrotState::calculateMaxIterations(Float zoom)
-{
-    // The max iteration count will increase linearly with zoom factor.
-    // TODO Does the result increase too quickly?
-
-    return MIN_MAX_ITERATIONS * (1.5 - std::log(static_cast<double>(zoom)) / std::log((double)MIN_ZOOM));
-}
-
-void MandelbrotState::calculateBitmap()
-{
-    static std::array<Complex, WIN_DIM * WIN_DIM> points;
-    static std::array<Float, WIN_DIM> row;
-    static std::array<Float, WIN_DIM> col;
-
-    //
-    // Generate a list of every Complex coordinate that needs to be calculated.
-
-    const Float dz (m_zoom * Float(1.0 / WIN_DIM));
-    Complex pt;
-    pt.real = m_origin.real - m_zoom * Float(0.5);
-    pt.imag = m_origin.imag - m_zoom * Float(0.5);
-
-    {
-        auto p = row.begin();
-        Float r = pt.real;
-        for (int i = 0; i < WIN_DIM; ++i) {
-            *p++ = r;
-            r += dz;
-        }
-    }
-
-    {
-        auto p = col.begin();
-        Float r = pt.imag;
-        for (int i = 0; i < WIN_DIM; ++i) {
-            *p++ = r;
-            r += dz;
-        }
-    }
-
-    auto ptr = points.begin();
-    for (int j = 0; j < WIN_DIM; ++j) {
-        Complex c;
-        c.imag = col[j];
-
-        for (int i = 0; i < WIN_DIM; ++i) {
-            c.real = row[i];
-            *ptr++ = c;
-        }
-    }
-
-    //
-    // Pass the list into the OpenCL kernel, and begin execution.
-
-    while (m_calcing)
-        std::this_thread::yield();
-
-    m_calcing = true;
-
-    clTime = std::chrono::high_resolution_clock::now();
-    m_cl_kernel->setArg(2, m_max_iterations);
-    m_cl_queue->enqueueWriteBuffer(*m_cl_input, CL_TRUE, 0, points.size() * sizeof(Complex), points.data());
-    m_cl_queue->enqueueNDRangeKernel(*m_cl_kernel, cl::NullRange, cl::NDRange(points.size()), cl::NullRange);
-}
-
diff --git a/opencl/r128.h b/opencl/r128.h
deleted file mode 100644
index 8fd7dae..0000000
--- a/opencl/r128.h
+++ /dev/null
@@ -1,2210 +0,0 @@
-/*
-r128.h: 128-bit (64.64) signed fixed-point arithmetic. Version 1.6.0
-
-COMPILATION
------------
-Drop this header file somewhere in your project and include it wherever it is
-needed. There is no separate .c file for this library. To get the code, in ONE
-file in your project, put:
-
-#define R128_IMPLEMENTATION
-
-before you include this file. You may also provide a definition for R128_ASSERT
-to force the library to use a custom assert macro.
-
-COMPILER/LIBRARY SUPPORT
-------------------------
-This library requires a C89 compiler with support for 64-bit integers. If your
-compiler does not support the long long data type, the R128_U64, etc. macros
-must be set appropriately. On x86 and x64 targets, Intel intrinsics are used
-for speed. If your compiler does not support these intrinsics, you can add
-#define R128_STDC_ONLY
-in your implementation file before including r128.h.
-
-The only C runtime library functionality used by this library is <assert.h>.
-This can be avoided by defining an R128_ASSERT macro in your implementation
-file. Since this library uses 64-bit arithmetic, this may implicitly add a
-runtime library dependency on 32-bit platforms.
-
-C++ SUPPORT
------------
-Operator overloads are supplied for C++ files that include this file. Since all
-C++ functions are declared inline (or static inline), the R128_IMPLEMENTATION
-file can be either C++ or C.
-
-LICENSE
--------
-This is free and unencumbered software released into the public domain.
-
-Anyone is free to copy, modify, publish, use, compile, sell, or
-distribute this software, either in source code form or as a compiled
-binary, for any purpose, commercial or non-commercial, and by any
-means.
-
-In jurisdictions that recognize copyright laws, the author or authors
-of this software dedicate any and all copyright interest in the
-software to the public domain. We make this dedication for the benefit
-of the public at large and to the detriment of our heirs and
-successors. We intend this dedication to be an overt act of
-relinquishment in perpetuity of all present and future rights to this
-software under copyright law.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#ifndef H_R128_H
-#define H_R128_H
-
-#include <stddef.h>
-
-// 64-bit integer support
-// If your compiler does not have stdint.h, add appropriate defines for these macros.
-#if defined(_MSC_VER) && (_MSC_VER < 1600)
-#  define R128_S32 __int32
-#  define R128_U32 unsigned __int32
-#  define R128_S64 __int64
-#  define R128_U64 unsigned __int64
-#  define R128_LIT_S64(x) x##i64
-#  define R128_LIT_U64(x) x##ui64
-#else
-#  include <stdint.h>
-#  define R128_S32 int32_t
-#  define R128_U32 uint32_t
-#  define R128_S64 long long
-#  define R128_U64 unsigned long long
-#  define R128_LIT_S64(x) x##ll
-#  define R128_LIT_U64(x) x##ull
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct R128 {
-   R128_U64 lo;
-   R128_U64 hi;
-
-#ifdef __cplusplus
-   R128();
-   R128(double);
-   R128(int);
-   R128(R128_S64);
-   R128(R128_U64 low, R128_U64 high);
-
-   operator double() const;
-   operator R128_S64() const;
-   operator int() const;
-   operator bool() const;
-
-   bool operator!() const;
-   R128 operator~() const;
-   R128 operator-() const;
-   R128 &operator|=(const R128 &rhs);
-   R128 &operator&=(const R128 &rhs);
-   R128 &operator^=(const R128 &rhs);
-   R128 &operator+=(const R128 &rhs);
-   R128 &operator-=(const R128 &rhs);
-   R128 &operator*=(const R128 &rhs);
-   R128 &operator/=(const R128 &rhs);
-   R128 &operator%=(const R128 &rhs);
-   R128 &operator<<=(int amount);
-   R128 &operator>>=(int amount);
-#endif   //__cplusplus
-} __attribute__ ((packed)) R128;
-
-// Type conversion
-extern void r128FromInt(R128 *dst, R128_S64 v);
-extern void r128FromFloat(R128 *dst, double v);
-extern R128_S64 r128ToInt(const R128 *v);
-extern double r128ToFloat(const R128 *v);
-
-// Copy
-extern void r128Copy(R128 *dst, const R128 *src);
-
-// Sign manipulation
-extern void r128Neg(R128 *dst, const R128 *v);   // -v
-extern void r128Abs(R128* dst, const R128* v);   // abs(v)
-extern void r128Nabs(R128* dst, const R128* v);  // -abs(v)
-
-// Bitwise operations
-extern void r128Not(R128 *dst, const R128 *src);               // ~a
-extern void r128Or(R128 *dst, const R128 *a, const R128 *b);   // a | b
-extern void r128And(R128 *dst, const R128 *a, const R128 *b);  // a & b
-extern void r128Xor(R128 *dst, const R128 *a, const R128 *b);  // a ^ b
-extern void r128Shl(R128 *dst, const R128 *src, int amount);   // shift left by amount mod 128
-extern void r128Shr(R128 *dst, const R128 *src, int amount);   // shift right logical by amount mod 128
-extern void r128Sar(R128 *dst, const R128 *src, int amount);   // shift right arithmetic by amount mod 128
-
-// Arithmetic
-extern void r128Add(R128 *dst, const R128 *a, const R128 *b);  // a + b
-extern void r128Sub(R128 *dst, const R128 *a, const R128 *b);  // a - b
-extern void r128Mul(R128 *dst, const R128 *a, const R128 *b);  // a * b
-extern void r128Div(R128 *dst, const R128 *a, const R128 *b);  // a / b
-extern void r128Mod(R128 *dst, const R128 *a, const R128 *b);  // a - toInt(a / b) * b
-
-extern void r128Sqrt(R128 *dst, const R128 *v);  // sqrt(v)
-extern void r128Rsqrt(R128 *dst, const R128 *v); // 1 / sqrt(v)
-
-// Comparison
-extern int  r128Cmp(const R128 *a, const R128 *b);  // sign of a-b
-extern void r128Min(R128 *dst, const R128 *a, const R128 *b);
-extern void r128Max(R128 *dst, const R128 *a, const R128 *b);
-extern void r128Floor(R128 *dst, const R128 *v);
-extern void r128Ceil(R128 *dst, const R128 *v);
-extern void r128Round(R128 *dst, const R128 *v);    // round to nearest, rounding halfway values away from zero
-extern int  r128IsNeg(const R128 *v); // quick check for < 0
-
-// String conversion
-//
-typedef enum R128ToStringSign {
-   R128ToStringSign_Default,  // no sign character for positive values
-   R128ToStringSign_Space,    // leading space for positive values
-   R128ToStringSign_Plus,     // leading '+' for positive values
-} R128ToStringSign;
-
-// Formatting options for use with r128ToStringOpt. The "defaults" correspond
-// to a format string of "%f".
-//
-typedef struct R128ToStringFormat {
-   // sign character for positive values. Default is R128ToStringSign_Default.
-   R128ToStringSign sign;
-
-   // minimum number of characters to write. Default is 0.
-   int width;
-
-   // place to the right of the decimal at which rounding is performed. If negative,
-   // a maximum of 20 decimal places will be written, with no trailing zeroes.
-   // (20 places is sufficient to ensure that r128FromString will convert back to the
-   // original value.) Default is -1. NOTE: This is not the same default that the C
-   // standard library uses for %f.
-   int precision;
-
-   // If non-zero, pads the output string with leading zeroes if the final result is
-   // fewer than width characters. Otherwise, leading spaces are used. Default is 0.
-   int zeroPad;
-
-   // Always print a decimal point, even if the value is an integer. Default is 0.
-   int decimal;
-
-   // Left-align output if width specifier requires padding.
-   // Default is 0 (right align).
-   int leftAlign;
-} R128ToStringFormat;
-
-// r128ToStringOpt: convert R128 to a decimal string, with formatting.
-//
-// dst and dstSize: specify the buffer to write into. At most dstSize bytes will be written
-// (including null terminator). No additional rounding is performed if dstSize is not large
-// enough to hold the entire string.
-//
-// opt: an R128ToStringFormat struct (q.v.) with formatting options.
-//
-// Uses the R128_decimal global as the decimal point character.
-// Always writes a null terminator, even if the destination buffer is not large enough.
-//
-// Number of bytes that will be written (i.e. how big does dst need to be?):
-// If width is specified: width + 1 bytes.
-// If precision is specified: at most precision + 22 bytes.
-// If neither is specified: at most 42 bytes.
-//
-// Returns the number of bytes that would have been written if dst was sufficiently large,
-// not including the final null terminator.
-//
-extern int r128ToStringOpt(char *dst, size_t dstSize, const R128 *v, const R128ToStringFormat *opt);
-
-// r128ToStringf: convert R128 to a decimal string, with formatting.
-//
-// dst and dstSize: specify the buffer to write into. At most dstSize bytes will be written
-// (including null terminator).
-//
-// format: a printf-style format specifier, as one would use with floating point types.
-//    e.g. "%+5.2f". (The leading % and trailing f are optional.)
-//    NOTE: This is NOT a full replacement for sprintf. Any characters in the format string
-//       that do not correspond to a format placeholder are ignored.
-//
-// Uses the R128_decimal global as the decimal point character.
-// Always writes a null terminator, even if the destination buffer is not large enough.
-//
-// Number of bytes that will be written (i.e. how big does dst need to be?):
-// If the precision field is specified: at most max(width, precision + 21) + 1 bytes
-// Otherwise: at most max(width, 41) + 1 bytes.
-//
-// Returns the number of bytes that would have been written if dst was sufficiently large,
-// not including the final null terminator.
-//
-extern int r128ToStringf(char *dst, size_t dstSize, const char *format, const R128 *v);
-
-// r128ToString: convert R128 to a decimal string, with default formatting.
-// Equivalent to r128ToStringf(dst, dstSize, "%f", v).
-//
-// Uses the R128_decimal global as the decimal point character.
-// Always writes a null terminator, even if the destination buffer is not large enough.
-//
-// Will write at most 42 bytes (including NUL) to dst.
-//
-// Returns the number of bytes that would have been written if dst was sufficiently large,
-// not including the final null terminator.
-//
-extern int r128ToString(char *dst, size_t dstSize, const R128 *v);
-
-// r128FromString: Convert string to R128.
-//
-// The string can be formatted either as a decimal number with optional sign
-// or as hexadecimal with a prefix of 0x or 0X.
-//
-// endptr, if not NULL, is set to the character following the last character
-//   used in the conversion.
-//
-extern void r128FromString(R128 *dst, const char *s, char **endptr);
-
-// Constants
-extern const R128 R128_min;      // minimum (most negative) value
-extern const R128 R128_max;      // maximum (most positive) value
-extern const R128 R128_smallest; // smallest positive value
-extern const R128 R128_zero;     // zero
-extern const R128 R128_one;      // 1.0
-
-extern char R128_decimal;        // decimal point character used by r128From/ToString. defaults to '.'
-
-#ifdef __cplusplus
-}
-
-#include <limits>
-namespace std {
-template<>
-struct numeric_limits<R128>
-{
-   static const bool is_specialized = true;
-
-   static R128 min() throw() { return R128_min; }
-   static R128 max() throw() { return R128_max; }
-
-   static const int digits = 127;
-   static const int digits10 = 38;
-   static const bool is_signed = true;
-   static const bool is_integer = false;
-   static const bool is_exact = false;
-   static const int radix = 2;
-   static R128 epsilon() throw() { return R128_smallest; }
-   static R128 round_error() throw() { return R128_one; }
-
-   static const int min_exponent = 0;
-   static const int min_exponent10 = 0;
-   static const int max_exponent = 0;
-   static const int max_exponent10 = 0;
-
-   static const bool has_infinity = false;
-   static const bool has_quiet_NaN = false;
-   static const bool has_signaling_NaN = false;
-   static const float_denorm_style has_denorm = denorm_absent;
-   static const bool has_denorm_loss = false;
-
-   static R128 infinity() throw() { return R128_zero; }
-   static R128 quiet_NaN() throw() { return R128_zero; }
-   static R128 signaling_NaN() throw() { return R128_zero; }
-   static R128 denorm_min() throw() { return R128_zero; }
-
-   static const bool is_iec559 = false;
-   static const bool is_bounded = true;
-   static const bool is_modulo = true;
-
-   static const bool traps = numeric_limits<R128_U64>::traps;
-   static const bool tinyness_before = false;
-   static const float_round_style round_style = round_toward_zero;
-};
-}  //namespace std
-
-inline R128::R128() {}
-
-inline R128::R128(double v)
-{
-   r128FromFloat(this, v);
-}
-
-inline R128::R128(int v)
-{
-   r128FromInt(this, v);
-}
-
-inline R128::R128(R128_S64 v)
-{
-   r128FromInt(this, v);
-}
-
-inline R128::R128(R128_U64 low, R128_U64 high)
-{
-   lo = low;
-   hi = high;
-}
-
-inline R128::operator double() const
-{
-   return r128ToFloat(this);
-}
-
-inline R128::operator R128_S64() const
-{
-   return r128ToInt(this);
-}
-
-inline R128::operator int() const
-{
-   return (int) r128ToInt(this);
-}
-
-inline R128::operator bool() const
-{
-   return lo || hi;
-}
-
-inline bool R128::operator!() const
-{
-   return !lo && !hi;
-}
-
-inline R128 R128::operator~() const
-{
-   R128 r;
-   r128Not(&r, this);
-   return r;
-}
-
-inline R128 R128::operator-() const
-{
-   R128 r;
-   r128Neg(&r, this);
-   return r;
-}
-
-inline R128 &R128::operator|=(const R128 &rhs)
-{
-   r128Or(this, this, &rhs);
-   return *this;
-}
-
-inline R128 &R128::operator&=(const R128 &rhs)
-{
-   r128And(this, this, &rhs);
-   return *this;
-}
-
-inline R128 &R128::operator^=(const R128 &rhs)
-{
-   r128Xor(this, this, &rhs);
-   return *this;
-}
-
-inline R128 &R128::operator+=(const R128 &rhs)
-{
-   r128Add(this, this, &rhs);
-   return *this;
-}
-
-inline R128 &R128::operator-=(const R128 &rhs)
-{
-   r128Sub(this, this, &rhs);
-   return *this;
-}
-
-inline R128 &R128::operator*=(const R128 &rhs)
-{
-   r128Mul(this, this, &rhs);
-   return *this;
-}
-
-inline R128 &R128::operator/=(const R128 &rhs)
-{
-   r128Div(this, this, &rhs);
-   return *this;
-}
-
-inline R128 &R128::operator%=(const R128 &rhs)
-{
-   r128Mod(this, this, &rhs);
-   return *this;
-}
-
-inline R128 &R128::operator<<=(int amount)
-{
-   r128Shl(this, this, amount);
-   return *this;
-}
-
-inline R128 &R128::operator>>=(int amount)
-{
-   r128Sar(this, this, amount);
-   return *this;
-}
-
-static inline R128 operator|(const R128 &lhs, const R128 &rhs)
-{
-   R128 r(lhs);
-   return r |= rhs;
-}
-
-static inline R128 operator&(const R128 &lhs, const R128 &rhs)
-{
-   R128 r(lhs);
-   return r &= rhs;
-}
-
-static inline R128 operator^(const R128 &lhs, const R128 &rhs)
-{
-   R128 r(lhs);
-   return r ^= rhs;
-}
-
-static inline R128 operator+(const R128 &lhs, const R128 &rhs)
-{
-   R128 r(lhs);
-   return r += rhs;
-}
-
-static inline R128 operator-(const R128 &lhs, const R128 &rhs)
-{
-   R128 r(lhs);
-   return r -= rhs;
-}
-
-static inline R128 operator*(const R128 &lhs, const R128 &rhs)
-{
-   R128 r(lhs);
-   return r *= rhs;
-}
-
-static inline R128 operator/(const R128 &lhs, const R128 &rhs)
-{
-   R128 r(lhs);
-   return r /= rhs;
-}
-
-static inline R128 operator%(const R128 &lhs, const R128 &rhs)
-{
-   R128 r(lhs);
-   return r %= rhs;
-}
-
-static inline R128 operator<<(const R128 &lhs, int amount)
-{
-   R128 r(lhs);
-   return r <<= amount;
-}
-
-static inline R128 operator>>(const R128 &lhs, int amount)
-{
-   R128 r(lhs);
-   return r >>= amount;
-}
-
-static inline bool operator<(const R128 &lhs, const R128 &rhs)
-{
-   return r128Cmp(&lhs, &rhs) < 0;
-}
-
-static inline bool operator>(const R128 &lhs, const R128 &rhs)
-{
-   return r128Cmp(&lhs, &rhs) > 0;
-}
-
-static inline bool operator<=(const R128 &lhs, const R128 &rhs)
-{
-   return r128Cmp(&lhs, &rhs) <= 0;
-}
-
-static inline bool operator>=(const R128 &lhs, const R128 &rhs)
-{
-   return r128Cmp(&lhs, &rhs) >= 0;
-}
-
-static inline bool operator==(const R128 &lhs, const R128 &rhs)
-{
-   return lhs.lo == rhs.lo && lhs.hi == rhs.hi;
-}
-
-static inline bool operator!=(const R128 &lhs, const R128 &rhs)
-{
-   return lhs.lo != rhs.lo || lhs.hi != rhs.hi;
-}
-
-#endif   //__cplusplus
-#endif   //H_R128_H
-
-#ifdef R128_IMPLEMENTATION
-
-#ifdef R128_DEBUG_VIS
-#  define R128_DEBUG_SET(x)   r128ToString(R128_last, sizeof(R128_last), x)
-#else
-#  define R128_DEBUG_SET(x)
-#endif
-
-#define R128_SET2(x, l, h) do { (x)->lo = (R128_U64)(l); (x)->hi = (R128_U64)(h); } while(0)
-#define R128_R0(x) ((R128_U32)(x)->lo)
-#define R128_R2(x) ((R128_U32)(x)->hi)
-#if defined(_M_IX86)
-// workaround: MSVC x86's handling of 64-bit values is not great
-#  define R128_SET4(x, r0, r1, r2, r3) do { \
-      ((R128_U32*)&(x)->lo)[0] = (R128_U32)(r0); \
-      ((R128_U32*)&(x)->lo)[1] = (R128_U32)(r1); \
-      ((R128_U32*)&(x)->hi)[0] = (R128_U32)(r2); \
-      ((R128_U32*)&(x)->hi)[1] = (R128_U32)(r3); \
-      } while(0)
-#  define R128_R1(x) (((R128_U32*)&(x)->lo)[1])
-#  define R128_R3(x) (((R128_U32*)&(x)->hi)[1])
-#else
-#  define R128_SET4(x, r0, r1, r2, r3) do { (x)->lo = (R128_U64)(r0) | ((R128_U64)(r1) << 32); \
-      (x)->hi = (R128_U64)(r2) | ((R128_U64)(r3) << 32); } while(0)
-#  define R128_R1(x) ((R128_U32)((x)->lo >> 32))
-#  define R128_R3(x) ((R128_U32)((x)->hi >> 32))
-#endif
-
-#if defined(_M_X64)
-#  define R128_INTEL 1
-#  define R128_64BIT 1
-#  ifndef R128_STDC_ONLY
-#     include <intrin.h>
-#  endif
-#elif defined(__x86_64__)
-#  define R128_INTEL 1
-#  define R128_64BIT 1
-#  ifndef R128_STDC_ONLY
-#     include <x86intrin.h>
-#  endif
-#elif defined(_M_IX86)
-#  define R128_INTEL 1
-#  ifndef R128_STDC_ONLY
-#     include <intrin.h>
-#  endif
-#elif defined(__i386__)
-#  define R128_INTEL 1
-#  ifndef R128_STDC_ONLY
-#     include <x86intrin.h>
-#  endif
-#elif defined(_M_ARM)
-#  ifndef R128_STDC_ONLY
-#     include <intrin.h>
-#  endif
-#elif defined(_M_ARM64)
-#  define R128_64BIT 1
-#  ifndef R128_STDC_ONLY
-#     include <intrin.h>
-#  endif
-#elif defined(__aarch64__)
-#  define R128_64BIT 1
-#endif
-
-#ifndef R128_INTEL
-#  define R128_INTEL 0
-#endif
-
-#ifndef R128_64BIT
-#  define R128_64BIT 0
-#endif
-
-#ifndef R128_ASSERT
-#  include <assert.h>
-#  define R128_ASSERT(x) assert(x)
-#endif
-
-#include <stdlib.h>  // for NULL
-
-static const R128ToStringFormat R128__defaultFormat = {
-   R128ToStringSign_Default,
-   0,
-   -1,
-   0,
-   0,
-   0
-};
-
-const R128 R128_min = { 0, R128_LIT_U64(0x8000000000000000) };
-const R128 R128_max = { R128_LIT_U64(0xffffffffffffffff), R128_LIT_U64(0x7fffffffffffffff) };
-const R128 R128_smallest = { 1, 0 };
-const R128 R128_zero = { 0, 0 };
-const R128 R128_one = { 0, 1 };
-char R128_decimal = '.';
-#ifdef R128_DEBUG_VIS
-char R128_last[42];
-#endif
-
-static int r128__clz64(R128_U64 x)
-{
-#if defined(R128_STDC_ONLY)
-   R128_U64 n = 64, y;
-   y = x >> 32; if (y) { n -= 32; x = y; }
-   y = x >> 16; if (y) { n -= 16; x = y; }
-   y = x >>  8; if (y) { n -=  8; x = y; }
-   y = x >>  4; if (y) { n -=  4; x = y; }
-   y = x >>  2; if (y) { n -=  2; x = y; }
-   y = x >>  1; if (y) { n -=  1; x = y; }
-   return (int)(n - x);
-#elif defined(_M_X64) || defined(_M_ARM64)
-   unsigned long idx;
-   if (_BitScanReverse64(&idx, x)) {
-      return 63 - (int)idx;
-   } else {
-      return 64;
-   }
-#elif defined(_MSC_VER)
-   unsigned long idx;
-   if (_BitScanReverse(&idx, (R128_U32)(x >> 32))) {
-      return 31 - (int)idx;
-   } else if (_BitScanReverse(&idx, (R128_U32)x)) {
-      return 63 - (int)idx;
-   } else {
-      return 64;
-   }
-#else
-   return x ? __builtin_clzll(x) : 64;
-#endif
-}
-
-#if !R128_64BIT
-// 32*32->64
-static R128_U64 r128__umul64(R128_U32 a, R128_U32 b)
-{
-#  if defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
-   return __emulu(a, b);
-#  elif defined(_M_ARM) && !defined(R128_STDC_ONLY)
-   return _arm_umull(a, b);
-#  else
-   return a * (R128_U64)b;
-#  endif
-}
-
-// 64/32->32
-static R128_U32 r128__udiv64(R128_U32 nlo, R128_U32 nhi, R128_U32 d, R128_U32 *rem)
-{
-#  if defined(_M_IX86) && (_MSC_VER >= 1920) && !defined(R128_STDC_ONLY)
-   unsigned __int64 n = ((unsigned __int64)nhi << 32) | nlo;
-   return _udiv64(n, d, rem);
-#  elif defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
-   __asm {
-      mov eax, nlo
-      mov edx, nhi
-      div d
-      mov ecx, rem
-      mov dword ptr [ecx], edx
-   }
-#  elif defined(__i386__) && !defined(R128_STDC_ONLY)
-   R128_U32 q, r;
-   __asm("divl %4"
-      : "=a"(q), "=d"(r)
-      : "a"(nlo), "d"(nhi), "X"(d));
-   *rem = r;
-   return q;
-#  else
-   R128_U64 n64 = ((R128_U64)nhi << 32) | nlo;
-   *rem = (R128_U32)(n64 % d);
-   return (R128_U32)(n64 / d);
-#  endif
-}
-#elif defined(R128_STDC_ONLY) || !R128_INTEL
-#define r128__umul64(a, b) ((a) * (R128_U64)(b))
-static R128_U32 r128__udiv64(R128_U32 nlo, R128_U32 nhi, R128_U32 d, R128_U32 *rem)
-{
-   R128_U64 n64 = ((R128_U64)nhi << 32) | nlo;
-   *rem = (R128_U32)(n64 % d);
-   return (R128_U32)(n64 / d);
-}
-#endif   //!R128_64BIT
-
-static void r128__neg(R128 *dst, const R128 *src)
-{
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(src != NULL);
-
-//#if R128_INTEL && !defined(R128_STDC_ONLY)
-//   {
-//      unsigned char carry = 0;
-//#  if R128_64BIT
-//      carry = _addcarry_u64(carry, ~src->lo, 1, &dst->lo);
-//      carry = _addcarry_u64(carry, ~src->hi, 0, &dst->hi);
-//#  else
-//      R128_U32 r0, r1, r2, r3;
-//      carry = _addcarry_u32(carry, ~R128_R0(src), 1, &r0);
-//      carry = _addcarry_u32(carry, ~R128_R1(src), 0, &r1);
-//      carry = _addcarry_u32(carry, ~R128_R2(src), 0, &r2);
-//      carry = _addcarry_u32(carry, ~R128_R3(src), 0, &r3);
-//      R128_SET4(dst, r0, r1, r2, r3);
-//#  endif //R128_64BIT
-//   }
-//#else
-   if (src->lo) {
-      dst->lo = ~src->lo + 1;
-      dst->hi = ~src->hi;
-   } else {
-      dst->lo = 0;
-      dst->hi = ~src->hi + 1;
-   }
-//#endif   //R128_INTEL
-}
-
-// 64*64->128
-static void r128__umul128(R128 *dst, R128_U64 a, R128_U64 b)
-{
-//#if defined(_M_X64) && !defined(R128_STDC_ONLY)
-//   dst->lo = _umul128(a, b, &dst->hi);
-//#elif R128_64BIT && !defined(_MSC_VER) && !defined(R128_STDC_ONLY)
-//   unsigned __int128 p0 = a * (unsigned __int128)b;
-//   dst->hi = (R128_U64)(p0 >> 64);
-//   dst->lo = (R128_U64)p0;
-//#else
-   R128_U32 alo = (R128_U32)a;
-   R128_U32 ahi = (R128_U32)(a >> 32);
-   R128_U32 blo = (R128_U32)b;
-   R128_U32 bhi = (R128_U32)(b >> 32);
-   R128_U64 p0, p1, p2, p3;
-
-   p0 = alo * (uint64_t)blo;
-   p1 = alo * (uint64_t)bhi;
-   p2 = ahi * (uint64_t)blo;
-   p3 = ahi * (uint64_t)bhi;
-
-   {
-//#if R128_INTEL && !defined(R128_STDC_ONLY)
-//      R128_U32 r0, r1, r2, r3;
-//      unsigned char carry;
-//
-//      r0 = (R128_U32)(p0);
-//      r1 = (R128_U32)(p0 >> 32);
-//      r2 = (R128_U32)(p1 >> 32);
-//      r3 = (R128_U32)(p3 >> 32);
-//
-//      carry = _addcarry_u32(0, r1, (R128_U32)p1, &r1);
-//      carry = _addcarry_u32(carry, r2, (R128_U32)(p2 >> 32), &r2);
-//      _addcarry_u32(carry, r3, 0, &r3);
-//      carry = _addcarry_u32(0, r1, (R128_U32)p2, &r1);
-//      carry = _addcarry_u32(carry, r2, (R128_U32)p3, &r2);
-//      _addcarry_u32(carry, r3, 0, &r3);
-//
-//      R128_SET4(dst, r0, r1, r2, r3);
-//#else
-      R128_U64 carry, lo, hi;
-      carry = ((R128_U64)(R128_U32)p1 + (R128_U64)(R128_U32)p2 + (p0 >> 32)) >> 32;
-
-      lo = p0 + ((p1 + p2) << 32);
-      hi = p3 + ((R128_U32)(p1 >> 32) + (R128_U32)(p2 >> 32)) + carry;
-
-      R128_SET2(dst, lo, hi);
-//#endif
-   }
-//#endif
-}
-
-// 128/64->64
-#if defined(_M_X64) && (_MSC_VER < 1920) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
-// MSVC x64 provides neither inline assembly nor (pre-2019) a div intrinsic, so we do fake
-// "inline assembly" to avoid long division or outline assembly.
-#pragma code_seg(".text")
-__declspec(allocate(".text") align(16)) static const unsigned char r128__udiv128Code[] = {
-   0x48, 0x8B, 0xC1,       //mov  rax, rcx
-   0x49, 0xF7, 0xF0,       //div  rax, r8
-   0x49, 0x89, 0x11,       //mov  qword ptr [r9], rdx
-   0xC3                    //ret
-};
-typedef R128_U64 (*r128__udiv128Proc)(R128_U64 nlo, R128_U64 nhi, R128_U64 d, R128_U64 *rem);
-static const r128__udiv128Proc r128__udiv128 = (r128__udiv128Proc)(void*)r128__udiv128Code;
-#else
-static R128_U64 r128__udiv128(R128_U64 nlo, R128_U64 nhi, R128_U64 d, R128_U64 *rem)
-{
-#if defined(_M_X64) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
-   return _udiv128(nhi, nlo, d, rem);
-#elif defined(__x86_64__) && !defined(R128_STDC_ONLY)
-   R128_U64 q, r;
-   __asm("divq %4"
-      : "=a"(q), "=d"(r)
-      : "a"(nlo), "d"(nhi), "X"(d));
-   *rem = r;
-   return q;
-#else
-   R128_U64 tmp;
-   R128_U32 d0, d1;
-   R128_U32 n3, n2, n1, n0;
-   R128_U32 q0, q1;
-   R128_U32 r;
-   int shift;
-
-   R128_ASSERT(d != 0);    //division by zero
-   R128_ASSERT(nhi < d);   //overflow
-
-   // normalize
-   shift = r128__clz64(d);
-
-   if (shift) {
-      R128 tmp128;
-      R128_SET2(&tmp128, nlo, nhi);
-      r128Shl(&tmp128, &tmp128, shift);
-      n3 = R128_R3(&tmp128);
-      n2 = R128_R2(&tmp128);
-      n1 = R128_R1(&tmp128);
-      n0 = R128_R0(&tmp128);
-      d <<= shift;
-   } else {
-      n3 = (R128_U32)(nhi >> 32);
-      n2 = (R128_U32)nhi;
-      n1 = (R128_U32)(nlo >> 32);
-      n0 = (R128_U32)nlo;
-   }
-
-   d1 = (R128_U32)(d >> 32);
-   d0 = (R128_U32)d;
-
-   // first digit
-   R128_ASSERT(n3 <= d1);
-   if (n3 < d1) {
-      q1 = r128__udiv64(n2, n3, d1, &r);
-   } else {
-      q1 = 0xffffffffu;
-      r = n2 + d1;
-   }
-refine1:
-   if (r128__umul64(q1, d0) > ((R128_U64)r << 32) + n1) {
-      --q1;
-      if (r < ~d1 + 1) {
-         r += d1;
-         goto refine1;
-      }
-   }
-
-   tmp = ((R128_U64)n2 << 32) + n1 - (r128__umul64(q1, d0) + (r128__umul64(q1, d1) << 32));
-   n2 = (R128_U32)(tmp >> 32);
-   n1 = (R128_U32)tmp;
-
-   // second digit
-   R128_ASSERT(n2 <= d1);
-   if (n2 < d1) {
-      q0 = r128__udiv64(n1, n2, d1, &r);
-   } else {
-      q0 = 0xffffffffu;
-      r = n1 + d1;
-   }
-refine0:
-   if (r128__umul64(q0, d0) > ((R128_U64)r << 32) + n0) {
-      --q0;
-      if (r < ~d1 + 1) {
-         r += d1;
-         goto refine0;
-      }
-   }
-
-   tmp = ((R128_U64)n1 << 32) + n0 - (r128__umul64(q0, d0) + (r128__umul64(q0, d1) << 32));
-   n1 = (R128_U32)(tmp >> 32);
-   n0 = (R128_U32)tmp;
-
-   *rem = (((R128_U64)n1 << 32) + n0) >> shift;
-   return ((R128_U64)q1 << 32) + q0;
-#endif
-}
-#endif
-
-static int r128__ucmp(const R128 *a, const R128 *b)
-{
-   if (a->hi != b->hi) {
-      if (a->hi > b->hi) {
-         return 1;
-      } else {
-         return -1;
-      }
-   } else {
-      if (a->lo == b->lo) {
-         return 0;
-      } else if (a->lo > b->lo) {
-         return 1;
-      } else {
-         return -1;
-      }
-   }
-}
-
-/*void mul(uint64_t op1, uint64_t op2, uint64_t *hi, uint64_t *lo) {
-    uint64_t u1 = (op1 & 0xffffffff),
-             v1 = (op2 & 0xffffffff),
-             t = (u1 * v1),
-             w3 = (t & 0xffffffff),
-             k = (t >> 32);
-
-    op1 >>= 32;
-    t = (op1 * v1) + k;
-    k = (t & 0xffffffff);
-
-    uint64_t w1 = (t >> 32);
-    op2 >>= 32;
-    t = (u1 * op2) + k;
-    k = (t >> 32);
-    *hi = (op1 * op2) + w1 + k;
-    *lo = (t << 32) + w3;
-}*/ 
-
-static void r128__umul(R128 *dst, const R128 *a, const R128 *b)
-{
-//#if defined(_M_X64) && !defined(R128_STDC_ONLY)
-//   R128_U64 t0, t1;
-//   R128_U64 lo, hi = 0;
-//   unsigned char carry;
-//
-//   t0 = _umul128(a->lo, b->lo, &t1);
-//   carry = _addcarry_u64(0, t1, t0 >> 63, &lo);
-//   _addcarry_u64(carry, hi, hi, &hi);
-//
-//   t0 = _umul128(a->lo, b->hi, &t1);
-//   carry = _addcarry_u64(0, lo, t0, &lo);
-//   _addcarry_u64(carry, hi, t1, &hi);
-//
-//   t0 = _umul128(a->hi, b->lo, &t1);
-//   carry = _addcarry_u64(0, lo, t0, &lo);
-//   _addcarry_u64(carry, hi, t1, &hi);
-//
-//   t0 = _umul128(a->hi, b->hi, &t1);
-//   hi += t0;
-//
-//   R128_SET2(dst, lo, hi);
-//#elif defined(__x86_64__) && !defined(R128_STDC_ONLY)
-//   unsigned __int128 p0, p1, p2, p3;
-//   p0 = a->lo * (unsigned __int128)b->lo;
-//   p1 = a->lo * (unsigned __int128)b->hi;
-//   p2 = a->hi * (unsigned __int128)b->lo;
-//   p3 = a->hi * (unsigned __int128)b->hi;
-//
-//   p0 = (p3 << 64) + p2 + p1 + (p0 >> 64) + ((R128_U64)p0 >> 63);
-//   dst->lo = (R128_U64)p0;
-//   dst->hi = (R128_U64)(p0 >> 64);
-//#else
-   R128 albl, ahbl, albh, ahbh;
-
-   r128__umul128(&albl, a->lo, b->lo);
-   r128__umul128(&ahbl, a->hi, b->lo);
-   r128__umul128(&albh, a->lo, b->hi);
-   r128__umul128(&ahbh, a->hi, b->hi);
-
-   R128 sum;
-
-   r128Shr(&ahbh, &ahbh, 60);
-   sum.lo = 0;
-   sum.hi = ahbh.lo;
-
-   albl.lo = albl.hi;
-   albl.hi = 0;
-   r128Add(&sum, &sum, &albl);
-   r128Add(&sum, &sum, &ahbl);
-   r128Add(&sum, &sum, &albh);
-
-   R128_SET2(dst, sum.lo, sum.hi);
-
-   //R128 p0, p1, p2, p3, round;
-
-   //r128__umul128(&p0, a->lo, b->lo);
-   //round.hi = 0; round.lo = p0.lo >> 63;
-   ////r128Shr(&p0, &p0, 64);
-   //r128Add(&p0, &p0, &round);
-
-   //r128__umul128(&p1, a->hi, b->lo);
-   //r128Add(&p0, &p0, &p1);
-
-   //r128__umul128(&p2, a->lo, b->hi);
-   //r128Add(&p0, &p0, &p2);
-
-   //r128__umul128(&p3, a->hi, b->hi);
-   //r128Shl(&p3, &p3, 32);
-   //r128Add(&p0, &p0, &p3);
-
-   //R128_SET2(dst, p0.lo, p0.hi);
-//#endif
-}
-
-// Shift d left until the high bit is set, and shift n left by the same amount.
-// returns non-zero on overflow.
-static int r128__norm(R128 *n, R128 *d, R128_U64 *n2)
-{
-   R128_U64 d0, d1;
-   R128_U64 n0, n1;
-   int shift;
-
-   d1 = d->hi;
-   d0 = d->lo;
-   n1 = n->hi;
-   n0 = n->lo;
-
-   if (d1) {
-      shift = r128__clz64(d1);
-      if (shift) {
-         d1 = (d1 << shift) | (d0 >> (64 - shift));
-         d0 = d0 << shift;
-         *n2 = n1 >> (64 - shift);
-         n1 = (n1 << shift) | (n0 >> (64 - shift));
-         n0 = n0 << shift;
-      } else {
-         *n2 = 0;
-      }
-   } else {
-      shift = r128__clz64(d0);
-      if (r128__clz64(n1) <= shift) {
-         return 1; // overflow
-      }
-
-      if (shift) {
-         d1 = d0 << shift;
-         d0 = 0;
-         *n2 = (n1 << shift) | (n0 >> (64 - shift));
-         n1 = n0 << shift;
-         n0 = 0;
-      } else {
-         d1 = d0;
-         d0 = 0;
-         *n2 = n1;
-         n1 = n0;
-         n0 = 0;
-      }
-   }
-
-   R128_SET2(n, n0, n1);
-   R128_SET2(d, d0, d1);
-   return 0;
-}
-
-static void r128__udiv(R128 *quotient, const R128 *dividend, const R128 *divisor)
-{
-   R128 tmp;
-   R128_U64 d0, d1;
-   R128_U64 n1, n2, n3;
-   R128 q;
-
-   R128_ASSERT(dividend != NULL);
-   R128_ASSERT(divisor != NULL);
-   R128_ASSERT(quotient != NULL);
-   R128_ASSERT(divisor->hi != 0 || divisor->lo != 0);  // divide by zero
-
-   // scale dividend and normalize
-   {
-      R128 n, d;
-      R128_SET2(&n, dividend->lo, dividend->hi);
-      R128_SET2(&d, divisor->lo, divisor->hi);
-      if (r128__norm(&n, &d, &n3)) {
-         R128_SET2(quotient, R128_max.lo, R128_max.hi);
-         return;
-      }
-
-      d1 = d.hi;
-      d0 = d.lo;
-      n2 = n.hi;
-      n1 = n.lo;
-   }
-
-   // first digit
-   R128_ASSERT(n3 <= d1);
-   {
-      R128 t0, t1;
-      t0.lo = n1;
-      if (n3 < d1) {
-         q.hi = r128__udiv128(n2, n3, d1, &t0.hi);
-      } else {
-         q.hi = R128_LIT_U64(0xffffffffffffffff);
-         t0.hi = n2 + d1;
-      }
-
-refine1:
-      r128__umul128(&t1, q.hi, d0);
-      if (r128__ucmp(&t1, &t0) > 0) {
-         --q.hi;
-         if (t0.hi < ~d1 + 1) {
-            t0.hi += d1;
-            goto refine1;
-         }
-      }
-   }
-
-   {
-      R128 t0, t1, t2;
-      t0.hi = n2;
-      t0.lo = n1;
-
-      r128__umul128(&t1, q.hi, d0);
-      r128__umul128(&t2, q.hi, d1);
-
-      t2.hi = t2.lo; t2.lo = 0;  //r128Shl(&t2, &t2, 64);
-      r128Add(&tmp, &t1, &t2);
-      r128Sub(&tmp, &t0, &tmp);
-   }
-   n2 = tmp.hi;
-   n1 = tmp.lo;
-
-   // second digit
-   R128_ASSERT(n2 <= d1);
-   {
-      R128 t0, t1;
-      t0.lo = 0;
-      if (n2 < d1) {
-         q.lo = r128__udiv128(n1, n2, d1, &t0.hi);
-      } else {
-         q.lo = R128_LIT_U64(0xffffffffffffffff);
-         t0.hi = n1 + d1;
-      }
-
-   refine0:
-      r128__umul128(&t1, q.lo, d0);
-      if (r128__ucmp(&t1, &t0) > 0) {
-         --q.lo;
-         if (t0.hi < ~d1 + 1) {
-            t0.hi += d1;
-            goto refine0;
-         }
-      }
-   }
-
-   R128_SET2(quotient, q.lo, q.hi);
-}
-
-static R128_U64 r128__umod(R128 *n, R128 *d)
-{
-   R128_U64 d0, d1;
-   R128_U64 n3, n2, n1;
-   R128_U64 q;
-
-   R128_ASSERT(d != NULL);
-   R128_ASSERT(n != NULL);
-   R128_ASSERT(d->hi != 0 || d->lo != 0);  // divide by zero
-
-   if (r128__norm(n, d, &n3)) {
-      return R128_LIT_U64(0xffffffffffffffff);
-   }
-
-   d1 = d->hi;
-   d0 = d->lo;
-   n2 = n->hi;
-   n1 = n->lo;
-
-   R128_ASSERT(n3 < d1);
-   {
-      R128 t0, t1;
-      t0.lo = n1;
-      q = r128__udiv128(n2, n3, d1, &t0.hi);
-
-   refine1:
-      r128__umul128(&t1, q, d0);
-      if (r128__ucmp(&t1, &t0) > 0) {
-         --q;
-         if (t0.hi < ~d1 + 1) {
-            t0.hi += d1;
-            goto refine1;
-         }
-      }
-   }
-
-   return q;
-}
-
-static int r128__format(char *dst, size_t dstSize, const R128 *v, const R128ToStringFormat *format)
-{
-   char buf[128];
-   R128 tmp;
-   R128_U64 whole;
-   char *cursor, *decimal, *dstp = dst;
-   int sign = 0;
-   int fullPrecision = 1;
-   int width, precision;
-   int padCnt, trail = 0;
-
-   R128_ASSERT(dst != NULL && dstSize > 0);
-   R128_ASSERT(v != NULL);
-   R128_ASSERT(format != NULL);
-
-   --dstSize;
-
-   R128_SET2(&tmp, v->lo, v->hi);
-   if (r128IsNeg(&tmp)) {
-      r128__neg(&tmp, &tmp);
-      sign = 1;
-   }
-
-   width = format->width;
-   if (width < 0) {
-      width = 0;
-   }
-
-   precision = format->precision;
-   if (precision < 0) {
-      // print a maximum of 20 digits
-      fullPrecision = 0;
-      precision = 20;
-   } else if (precision > sizeof(buf) - 21) {
-      trail = precision - (sizeof(buf) - 21);
-      precision -= trail;
-   }
-
-   whole = tmp.hi;
-   decimal = cursor = buf;
-
-   // fractional part first in case a carry into the whole part is required
-   if (tmp.lo || format->decimal) {
-      while (tmp.lo || (fullPrecision && precision)) {
-         if ((int)(cursor - buf) == precision) {
-            if ((R128_S64)tmp.lo < 0) {
-               // round up, propagate carry backwards
-               char *c;
-               for (c = cursor - 1; c >= buf; --c) {
-                  char d = ++*c;
-                  if (d <= '9') {
-                     goto endfrac;
-                  } else {
-                     *c = '0';
-                  }
-               }
-
-               // carry out into the whole part
-               whole++;
-            }
-
-            break;
-         }
-
-         r128__umul128(&tmp, tmp.lo, 10);
-         *cursor++ = (char)tmp.hi + '0';
-      }
-
-   endfrac:
-      if (format->decimal || precision) {
-         decimal = cursor;
-         *cursor++ = R128_decimal;
-      }
-   }
-
-   // whole part
-   do {
-      char digit = (char)(whole % 10);
-      whole /= 10;
-      *cursor++ = digit + '0';
-   } while (whole);
-
-#define R128__WRITE(c) do { if (dstp < dst + dstSize) *dstp = c; ++dstp; } while(0)
-
-   padCnt = width - (int)(cursor - buf) - 1;
-
-   // left padding
-   if (!format->leftAlign) {
-      char padChar = format->zeroPad ? '0' : ' ';
-      if (format->zeroPad) {
-         if (sign) {
-            R128__WRITE('-');
-         } else if (format->sign == R128ToStringSign_Plus) {
-            R128__WRITE('+');
-         } else if (format->sign == R128ToStringSign_Space) {
-            R128__WRITE(' ');
-         } else {
-            ++padCnt;
-         }
-      }
-
-      for (; padCnt > 0; --padCnt) {
-         R128__WRITE(padChar);
-      }
-   }
-
-   if (format->leftAlign || !format->zeroPad) {
-      if (sign) {
-         R128__WRITE('-');
-      } else if (format->sign == R128ToStringSign_Plus) {
-         R128__WRITE('+');
-      } else if (format->sign == R128ToStringSign_Space) {
-         R128__WRITE(' ');
-      } else {
-         ++padCnt;
-      }
-   }
-
-   {
-      char *i;
-
-      // reverse the whole part
-      for (i = cursor - 1; i >= decimal; --i) {
-         R128__WRITE(*i);
-      }
-
-      // copy the fractional part
-      for (i = buf; i < decimal; ++i) {
-         R128__WRITE(*i);
-      }
-   }
-
-   // right padding
-   if (format->leftAlign) {
-      char padChar = format->zeroPad ? '0' : ' ';
-      for (; padCnt > 0; --padCnt) {
-         R128__WRITE(padChar);
-      }
-   }
-
-   // trailing zeroes for very large precision
-   while (trail--) {
-      R128__WRITE('0');
-   }
-
-#undef R128__WRITE
-
-   if (dstp <= dst + dstSize) {
-      *dstp = '\0';
-   } else {
-      dst[dstSize] = '\0';
-   }
-   return (int)(dstp - dst);
-}
-
-void r128FromInt(R128 *dst, R128_S64 v)
-{
-   R128_ASSERT(dst != NULL);
-   dst->lo = 0;
-   dst->hi = (R128_U64)v << 60;
-   R128_DEBUG_SET(dst);
-}
-
-void r128FromFloat(R128 *dst, double v)
-{
-   R128_ASSERT(dst != NULL);
-
-   if (v < -9223372036854775808.0) {
-      r128Copy(dst, &R128_min);
-   } else if (v >= 9223372036854775808.0) {
-      r128Copy(dst, &R128_max);
-   } else {
-      R128 r;
-      int sign = 0;
-
-      if (v < 0) {
-         v = -v;
-         sign = 1;
-      }
-
-      r.hi = ((R128_U64)(R128_S32)v) << 60;
-      v -= (R128_S32)v;
-      v *= (double)((R128_U64)1 << 60);
-      r.hi = (r.hi & 0xF000000000000000) + (R128_U64)v;
-      r.lo = (R128_U64)(v * 18446744073709551616.0);
-
-      //r.hi = (R128_U64)(R128_S64)v;
-      //v -= (R128_S64)v;
-      //r.lo = (R128_U64)(v * 18446744073709551616.0);
-
-      if (sign) {
-         r128__neg(&r, &r);
-      }
-
-      r128Copy(dst, &r);
-   }
-}
-
-void r128FromString(R128 *dst, const char *s, char **endptr)
-{
-   R128_U64 lo = 0, hi = 0;
-   R128_U64 base = 10;
-
-   int sign = 0;
-
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(s != NULL);
-
-   R128_SET2(dst, 0, 0);
-
-   // consume whitespace
-   for (;;) {
-      if (*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n' || *s == '\v') {
-         ++s;
-      } else {
-         break;
-      }
-   }
-
-   // sign
-   if (*s == '-') {
-      sign = 1;
-      ++s;
-   } else if (*s == '+') {
-      ++s;
-   }
-
-   // parse base prefix
-   if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) {
-      base = 16;
-      s += 2;
-   }
-
-   // whole part
-   for (;; ++s) {
-      R128_U64 digit;
-
-      if ('0' <= *s && *s <= '9') {
-         digit = *s - '0';
-      } else if (base == 16 && 'a' <= *s && *s <= 'f') {
-         digit = *s - 'a' + 10;
-      } else if (base == 16 && 'A' <= *s && *s <= 'F') {
-         digit = *s - 'A' + 10;
-      } else {
-         break;
-      }
-
-      hi = hi * base + digit;
-   }
-
-   // fractional part
-   if (*s == R128_decimal) {
-      const char *exp = ++s;
-
-      // find the last digit and work backwards
-      for (;; ++s) {
-         if ('0' <= *s && *s <= '9') {
-         } else if (base == 16 && ('a' <= *s && *s <= 'f')) {
-         } else if (base == 16 && ('A' <= *s && *s <= 'F')) {
-         } else {
-            break;
-         }
-      }
-
-      for (const char *c = s - 1; c >= exp; --c) {
-         R128_U64 digit, unused;
-
-         if ('0' <= *c && *c <= '9') {
-            digit = *c - '0';
-         } else if ('a' <= *c && *c <= 'f') {
-            digit = *c - 'a' + 10;
-         } else {
-            digit = *c - 'A' + 10;
-         }
-
-         lo = r128__udiv128(lo, digit, base, &unused);
-      }
-   }
-
-   R128_SET2(dst, lo, hi);
-   if (sign) {
-      r128__neg(dst, dst);
-   }
-
-   if (endptr) {
-      *endptr = (char *) s;
-   }
-}
-
-R128_S64 r128ToInt(const R128 *v)
-{
-   R128_ASSERT(v != NULL);
-   if ((R128_S64)v->hi < 0) {
-      return (R128_S64)v->hi + (v->lo != 0);
-   } else {
-      return (R128_S64)v->hi;
-   }
-}
-
-double r128ToFloat(const R128 *v)
-{
-   R128 tmp;
-   int sign = 0;
-   double d;
-
-   R128_ASSERT(v != NULL);
-
-   R128_SET2(&tmp, v->lo, v->hi);
-   if (r128IsNeg(&tmp)) {
-      r128__neg(&tmp, &tmp);
-      sign = 1;
-   }
-
-   d = (tmp.hi >> 60);
-   d += (tmp.hi & 0xFFFFFFFFFFFFFF) / (double)((uint64_t)1 << 60);
-   d += tmp.lo / (double)((uint64_t)1 << 60) / 18446744073709551616.0;
-   
-   if (sign) {
-      d = -d;
-   }
-
-   return d;
-}
-
-int r128ToStringOpt(char *dst, size_t dstSize, const R128 *v, const R128ToStringFormat *opt)
-{
-   return r128__format(dst, dstSize, v, opt);
-}
-
-int r128ToStringf(char *dst, size_t dstSize, const char *format, const R128 *v)
-{
-   R128ToStringFormat opts;
-
-   R128_ASSERT(dst != NULL && dstSize);
-   R128_ASSERT(format != NULL);
-   R128_ASSERT(v != NULL);
-
-   opts.sign = R128__defaultFormat.sign;
-   opts.precision = R128__defaultFormat.precision;
-   opts.zeroPad = R128__defaultFormat.zeroPad;
-   opts.decimal = R128__defaultFormat.decimal;
-   opts.leftAlign = R128__defaultFormat.leftAlign;
-
-   if (*format == '%') {
-      ++format;
-   }
-
-   // flags field
-   for (;; ++format) {
-      if (*format == ' ' && opts.sign != R128ToStringSign_Plus) {
-         opts.sign = R128ToStringSign_Space;
-      } else if (*format == '+') {
-         opts.sign = R128ToStringSign_Plus;
-      } else if (*format == '0') {
-         opts.zeroPad = 1;
-      } else if (*format == '-') {
-         opts.leftAlign = 1;
-      } else if (*format == '#') {
-         opts.decimal = 1;
-      } else {
-         break;
-      }
-   }
-
-   // width field
-   opts.width = 0;
-   for (;;) {
-      if ('0' <= *format && *format <= '9') {
-         opts.width = opts.width * 10 + *format++ - '0';
-      } else {
-         break;
-      }
-   }
-
-   // precision field
-   if (*format == '.') {
-      opts.precision = 0;
-      ++format;
-      for (;;) {
-         if ('0' <= *format && *format <= '9') {
-            opts.precision = opts.precision * 10 + *format++ - '0';
-         } else {
-            break;
-         }
-      }
-   }
-
-   return r128__format(dst, dstSize, v, &opts);
-}
-
-int r128ToString(char *dst, size_t dstSize, const R128 *v)
-{
-   return r128__format(dst, dstSize, v, &R128__defaultFormat);
-}
-
-void r128Copy(R128 *dst, const R128 *src)
-{
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(src != NULL);
-   dst->lo = src->lo;
-   dst->hi = src->hi;
-   R128_DEBUG_SET(dst);
-}
-
-void r128Neg(R128 *dst, const R128 *v)
-{
-   r128__neg(dst, v);
-   R128_DEBUG_SET(dst);
-}
-
-void r128Abs(R128* dst, const R128* v)
-{
-    R128 sign, inv;
-
-    R128_ASSERT(dst != NULL);
-    R128_ASSERT(v != NULL);
-
-    sign.lo = sign.hi = (R128_U64)(((R128_S64)v->hi) >> 63);
-    inv.lo = v->lo ^ sign.lo;
-    inv.hi = v->hi ^ sign.hi;
-
-    r128Sub(dst, &inv, &sign);
-}
-
-void r128Nabs(R128* dst, const R128* v)
-{
-    R128 sign, inv;
-
-    R128_ASSERT(dst != NULL);
-    R128_ASSERT(v != NULL);
-
-    sign.lo = sign.hi = (R128_U64)(((R128_S64)v->hi) >> 63);
-    inv.lo = v->lo ^ sign.lo;
-    inv.hi = v->hi ^ sign.hi;
-
-    r128Sub(dst, &sign, &inv);
-}
-
-void r128Not(R128 *dst, const R128 *src)
-{
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(src != NULL);
-
-   dst->lo = ~src->lo;
-   dst->hi = ~src->hi;
-   R128_DEBUG_SET(dst);
-}
-
-void r128Or(R128 *dst, const R128 *a, const R128 *b)
-{
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(a != NULL);
-   R128_ASSERT(b != NULL);
-
-   dst->lo = a->lo | b->lo;
-   dst->hi = a->hi | b->hi;
-   R128_DEBUG_SET(dst);
-}
-
-void r128And(R128 *dst, const R128 *a, const R128 *b)
-{
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(a != NULL);
-   R128_ASSERT(b != NULL);
-
-   dst->lo = a->lo & b->lo;
-   dst->hi = a->hi & b->hi;
-   R128_DEBUG_SET(dst);
-}
-
-void r128Xor(R128 *dst, const R128 *a, const R128 *b)
-{
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(a != NULL);
-   R128_ASSERT(b != NULL);
-
-   dst->lo = a->lo ^ b->lo;
-   dst->hi = a->hi ^ b->hi;
-   R128_DEBUG_SET(dst);
-}
-
-void r128Shl(R128 *dst, const R128 *src, int amount)
-{
-   R128_U64 r[4];
-
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(src != NULL);
-
-#if defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
-   __asm {
-      // load src
-      mov edx, dword ptr[src]
-      mov ecx, amount
-
-      mov edi, dword ptr[edx]
-      mov esi, dword ptr[edx + 4]
-      mov ebx, dword ptr[edx + 8]
-      mov eax, dword ptr[edx + 12]
-
-      // shift mod 32
-      shld eax, ebx, cl
-      shld ebx, esi, cl
-      shld esi, edi, cl
-      shl edi, cl
-
-      // clear out low 12 bytes of stack
-      xor edx, edx
-      mov dword ptr[r], edx
-      mov dword ptr[r + 4], edx
-      mov dword ptr[r + 8], edx
-
-      // store shifted amount offset by count/32 bits
-      shr ecx, 5
-      and ecx, 3
-      mov dword ptr[r + ecx * 4 + 0], edi
-      mov dword ptr[r + ecx * 4 + 4], esi
-      mov dword ptr[r + ecx * 4 + 8], ebx
-      mov dword ptr[r + ecx * 4 + 12], eax
-   }
-#else
-
-   r[0] = src->lo;
-   r[1] = src->hi;
-
-   amount &= 127;
-   if (amount >= 64) {
-      r[1] = r[0] << (amount - 64);
-      r[0] = 0;
-   } else if (amount) {
-#  ifdef _M_X64
-      r[1] = __shiftleft128(r[0], r[1], (char) amount);
-#  else
-      r[1] = (r[1] << amount) | (r[0] >> (64 - amount));
-#  endif
-      r[0] = r[0] << amount;
-   }
-#endif   //_M_IX86
-
-   dst->lo = r[0];
-   dst->hi = r[1];
-   R128_DEBUG_SET(dst);
-}
-
-void r128Shr(R128 *dst, const R128 *src, int amount)
-{
-   R128_U64 r[4];
-
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(src != NULL);
-
-#if defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
-   __asm {
-      // load src
-      mov edx, dword ptr[src]
-      mov ecx, amount
-
-      mov edi, dword ptr[edx]
-      mov esi, dword ptr[edx + 4]
-      mov ebx, dword ptr[edx + 8]
-      mov eax, dword ptr[edx + 12]
-
-      // shift mod 32
-      shrd edi, esi, cl
-      shrd esi, ebx, cl
-      shrd ebx, eax, cl
-      shr eax, cl
-
-      // clear out high 12 bytes of stack
-      xor edx, edx
-      mov dword ptr[r + 20], edx
-      mov dword ptr[r + 24], edx
-      mov dword ptr[r + 28], edx
-
-      // store shifted amount offset by -count/32 bits
-      shr ecx, 5
-      and ecx, 3
-      neg ecx
-      mov dword ptr[r + ecx * 4 + 16], edi
-      mov dword ptr[r + ecx * 4 + 20], esi
-      mov dword ptr[r + ecx * 4 + 24], ebx
-      mov dword ptr[r + ecx * 4 + 28], eax
-   }
-#else
-   r[2] = src->lo;
-   r[3] = src->hi;
-
-   amount &= 127;
-   if (amount >= 64) {
-      r[2] = r[3] >> (amount - 64);
-      r[3] = 0;
-   } else if (amount) {
-#ifdef _M_X64
-      r[2] = __shiftright128(r[2], r[3], (char) amount);
-#else
-      r[2] = (r[2] >> amount) | (r[3] << (64 - amount));
-#endif
-      r[3] = r[3] >> amount;
-   }
-#endif
-
-   dst->lo = r[2];
-   dst->hi = r[3];
-   R128_DEBUG_SET(dst);
-}
-
-void r128Sar(R128 *dst, const R128 *src, int amount)
-{
-   R128_U64 r[4];
-
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(src != NULL);
-
-#if defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
-   __asm {
-      // load src
-      mov edx, dword ptr[src]
-      mov ecx, amount
-
-      mov edi, dword ptr[edx]
-      mov esi, dword ptr[edx + 4]
-      mov ebx, dword ptr[edx + 8]
-      mov eax, dword ptr[edx + 12]
-
-      // shift mod 32
-      shrd edi, esi, cl
-      shrd esi, ebx, cl
-      shrd ebx, eax, cl
-      sar eax, cl
-
-      // copy sign to high 12 bytes of stack
-      cdq
-      mov dword ptr[r + 20], edx
-      mov dword ptr[r + 24], edx
-      mov dword ptr[r + 28], edx
-
-      // store shifted amount offset by -count/32 bits
-      shr ecx, 5
-      and ecx, 3
-      neg ecx
-      mov dword ptr[r + ecx * 4 + 16], edi
-      mov dword ptr[r + ecx * 4 + 20], esi
-      mov dword ptr[r + ecx * 4 + 24], ebx
-      mov dword ptr[r + ecx * 4 + 28], eax
-   }
-#else
-   r[2] = src->lo;
-   r[3] = src->hi;
-
-   amount &= 127;
-   if (amount >= 64) {
-      r[2] = (R128_U64)((R128_S64)r[3] >> (amount - 64));
-      r[3] = (R128_U64)((R128_S64)r[3] >> 63);
-   } else if (amount) {
-      r[2] = (r[2] >> amount) | (R128_U64)((R128_S64)r[3] << (64 - amount));
-      r[3] = (R128_U64)((R128_S64)r[3] >> amount);
-   }
-#endif
-
-   dst->lo = r[2];
-   dst->hi = r[3];
-   R128_DEBUG_SET(dst);
-}
-
-void r128Add(R128 *dst, const R128 *a, const R128 *b)
-{
-   unsigned char carry = 0;
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(a != NULL);
-   R128_ASSERT(b != NULL);
-
-#if R128_INTEL && !defined(R128_STDC_ONLY)
-#  if R128_64BIT
-   carry = _addcarry_u64(carry, a->lo, b->lo, &dst->lo);
-   carry = _addcarry_u64(carry, a->hi, b->hi, &dst->hi);
-#  else
-   R128_U32 r0, r1, r2, r3;
-   carry = _addcarry_u32(carry, R128_R0(a), R128_R0(b), &r0);
-   carry = _addcarry_u32(carry, R128_R1(a), R128_R1(b), &r1);
-   carry = _addcarry_u32(carry, R128_R2(a), R128_R2(b), &r2);
-   carry = _addcarry_u32(carry, R128_R3(a), R128_R3(b), &r3);
-   R128_SET4(dst, r0, r1, r2, r3);
-#  endif //R128_64BIT
-#else
-   {
-      R128_U64 r = a->lo + b->lo;
-      carry = r < a->lo;
-      dst->lo = r;
-      dst->hi = a->hi + b->hi + carry;
-   }
-#endif   //R128_INTEL
-
-   R128_DEBUG_SET(dst);
-}
-
-void r128Sub(R128 *dst, const R128 *a, const R128 *b)
-{
-   unsigned char borrow = 0;
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(a != NULL);
-   R128_ASSERT(b != NULL);
-
-#if R128_INTEL && !defined(R128_STDC_ONLY)
-#  if R128_64BIT
-   borrow = _subborrow_u64(borrow, a->lo, b->lo, &dst->lo);
-   borrow = _subborrow_u64(borrow, a->hi, b->hi, &dst->hi);
-#  else
-   R128_U32 r0, r1, r2, r3;
-   borrow = _subborrow_u32(borrow, R128_R0(a), R128_R0(b), &r0);
-   borrow = _subborrow_u32(borrow, R128_R1(a), R128_R1(b), &r1);
-   borrow = _subborrow_u32(borrow, R128_R2(a), R128_R2(b), &r2);
-   borrow = _subborrow_u32(borrow, R128_R3(a), R128_R3(b), &r3);
-   R128_SET4(dst, r0, r1, r2, r3);
-#  endif //R128_64BIT
-#else
-   {
-      R128_U64 r = a->lo - b->lo;
-      borrow = r > a->lo;
-      dst->lo = r;
-      dst->hi = a->hi - b->hi - borrow;
-   }
-#endif   //R128_INTEL
-
-   R128_DEBUG_SET(dst);
-}
-
-void r128Mul(R128 *dst, const R128 *a, const R128 *b)
-{
-   int sign = 0;
-   R128 ta, tb, tc;
-
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(a != NULL);
-   R128_ASSERT(b != NULL);
-
-   R128_SET2(&ta, a->lo, a->hi);
-   R128_SET2(&tb, b->lo, b->hi);
-
-   if (r128IsNeg(&ta)) {
-      r128__neg(&ta, &ta);
-      sign = !sign;
-   }
-   if (r128IsNeg(&tb)) {
-      r128__neg(&tb, &tb);
-      sign = !sign;
-   }
-
-   r128__umul(&tc, &ta, &tb);
-   if (sign) {
-      r128__neg(&tc, &tc);
-   }
-
-   r128Copy(dst, &tc);
-}
-
-void r128Div(R128 *dst, const R128 *a, const R128 *b)
-{
-   int sign = 0;
-   R128 tn, td, tq;
-
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(a != NULL);
-   R128_ASSERT(b != NULL);
-
-   R128_SET2(&tn, a->lo, a->hi);
-   R128_SET2(&td, b->lo, b->hi);
-
-   if (r128IsNeg(&tn)) {
-      r128__neg(&tn, &tn);
-      sign = !sign;
-   }
-
-   if (td.lo == 0 && td.hi == 0) {
-      // divide by zero
-      if (sign) {
-         r128Copy(dst, &R128_min);
-      } else {
-         r128Copy(dst, &R128_max);
-      }
-      return;
-   } else if (r128IsNeg(&td)) {
-      r128__neg(&td, &td);
-      sign = !sign;
-   }
-
-   r128__udiv(&tq, &tn, &td);
-
-   if (sign) {
-      r128__neg(&tq, &tq);
-   }
-
-   r128Copy(dst, &tq);
-}
-
-void r128Mod(R128 *dst, const R128 *a, const R128 *b)
-{
-   int sign = 0;
-   R128 tn, td, tq;
-
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(a != NULL);
-   R128_ASSERT(b != NULL);
-
-   R128_SET2(&tn, a->lo, a->hi);
-   R128_SET2(&td, b->lo, b->hi);
-
-   if (r128IsNeg(&tn)) {
-      r128__neg(&tn, &tn);
-      sign = !sign;
-   }
-
-   if (td.lo == 0 && td.hi == 0) {
-      // divide by zero
-      if (sign) {
-         r128Copy(dst, &R128_min);
-      } else {
-         r128Copy(dst, &R128_max);
-      }
-      return;
-   } else if (r128IsNeg(&td)) {
-      r128__neg(&td, &td);
-      sign = !sign;
-   }
-
-   tq.hi = r128__umod(&tn, &td);
-   tq.lo = 0;
-
-   if (sign) {
-      tq.hi = ~tq.hi + 1;
-   }
-
-   r128Mul(&tq, &tq, b);
-   r128Sub(dst, a, &tq);
-}
-
-void r128Rsqrt(R128 *dst, const R128 *v)
-{
-   static const R128 threeHalves = { R128_LIT_U64(0x8000000000000000), 1 };
-   R128 x, est;
-   int i;
-
-   if ((R128_S64)v->hi < 0) {
-      r128Copy(dst, &R128_min);
-      return;
-   }
-
-   R128_SET2(&x, v->lo, v->hi);
-
-   // get initial estimate
-   if (x.hi) {
-      int shift = (64 + r128__clz64(x.hi)) >> 1;
-      est.lo = R128_LIT_U64(1) << shift;
-      est.hi = 0;
-   } else if (x.lo) {
-      int shift = r128__clz64(x.lo) >> 1;
-      est.hi = R128_LIT_U64(1) << shift;
-      est.lo = 0;
-   } else {
-      R128_SET2(dst, 0, 0);
-      return;
-   }
-
-   // x /= 2
-   r128Shr(&x, &x, 1);
-
-   // Newton-Raphson iterate
-   for (i = 0; i < 7; ++i) {
-      R128 newEst;
-
-      // newEst = est * (threeHalves - (x / 2) * est * est);
-      r128__umul(&newEst, &est, &est);
-      r128__umul(&newEst, &newEst, &x);
-      r128Sub(&newEst, &threeHalves, &newEst);
-      r128__umul(&newEst, &est, &newEst);
-
-      if (newEst.lo == est.lo && newEst.hi == est.hi) {
-         break;
-      }
-      R128_SET2(&est, newEst.lo, newEst.hi);
-   }
-
-   r128Copy(dst, &est);
-}
-
-void r128Sqrt(R128 *dst, const R128 *v)
-{
-   R128 x, est;
-   int i;
-
-   if ((R128_S64)v->hi < 0) {
-      r128Copy(dst, &R128_min);
-      return;
-   }
-
-   R128_SET2(&x, v->lo, v->hi);
-
-   // get initial estimate
-   if (x.hi) {
-      int shift = (63 - r128__clz64(x.hi)) >> 1;
-      r128Shr(&est, &x, shift);
-   } else if (x.lo) {
-      int shift = (1 + r128__clz64(x.lo)) >> 1;
-      r128Shl(&est, &x, shift);
-   } else {
-      R128_SET2(dst, 0, 0);
-      return;
-   }
-
-   // Newton-Raphson iterate
-   for (i = 0; i < 7; ++i) {
-      R128 newEst;
-
-      // newEst = (est + x / est) / 2
-      r128__udiv(&newEst, &x, &est);
-      r128Add(&newEst, &newEst, &est);
-      r128Shr(&newEst, &newEst, 1);
-
-      if (newEst.lo == est.lo && newEst.hi == est.hi) {
-         break;
-      }
-      R128_SET2(&est, newEst.lo, newEst.hi);
-   }
-
-   r128Copy(dst, &est);
-}
-
-int r128Cmp(const R128 *a, const R128 *b)
-{
-   R128_ASSERT(a != NULL);
-   R128_ASSERT(b != NULL);
-
-   if (a->hi == b->hi) {
-      if (a->lo == b->lo) {
-         return 0;
-      } else if (a->lo > b->lo) {
-         return 1;
-      } else {
-         return -1;
-      }
-   } else if ((R128_S64)a->hi > (R128_S64)b->hi) {
-      return 1;
-   } else {
-      return -1;
-   }
-}
-
-int r128IsNeg(const R128 *v)
-{
-   R128_ASSERT(v != NULL);
-
-   return (R128_S64)v->hi < 0;
-}
-
-void r128Min(R128 *dst, const R128 *a, const R128 *b)
-{
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(a != NULL);
-   R128_ASSERT(b != NULL);
-
-   if (r128Cmp(a, b) < 0) {
-      r128Copy(dst, a);
-   } else {
-      r128Copy(dst, b);
-   }
-}
-
-void r128Max(R128 *dst, const R128 *a, const R128 *b)
-{
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(a != NULL);
-   R128_ASSERT(b != NULL);
-
-   if (r128Cmp(a, b) > 0) {
-      r128Copy(dst, a);
-   } else {
-      r128Copy(dst, b);
-   }
-}
-
-void r128Floor(R128 *dst, const R128 *v)
-{
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(v != NULL);
-
-   dst->hi = v->hi;
-   dst->lo = 0;
-   R128_DEBUG_SET(dst);
-}
-
-void r128Ceil(R128 *dst, const R128 *v)
-{
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(v != NULL);
-
-   dst->hi = v->hi + (v->lo != 0);
-   dst->lo = 0;
-   R128_DEBUG_SET(dst);
-}
-
-void r128Round(R128* dst, const R128* v)
-{
-   R128_ASSERT(dst != NULL);
-   R128_ASSERT(v != NULL);
-
-   dst->hi = v->hi + (v->lo >= R128_LIT_U64(0x8000000000000000) + (R128_U64)((R128_S64)v->hi < 0));
-   dst->lo = 0;
-   R128_DEBUG_SET(dst);
-}
-
-#endif   //R128_IMPLEMENTATION
-