import numpy as np
from ctypes import byref, c_size_t
from numba.cuda.cudadrv.driver import device_memset, driver, USE_NV_BINDING
from numba import cuda
from numba.cuda.testing import unittest, ContextResettingTestCase
from numba.cuda.testing import skip_on_cudasim, skip_on_arm
from numba.tests.support import linux_only


@skip_on_cudasim('CUDA Driver API unsupported in the simulator')
@linux_only
@skip_on_arm('Managed Alloc support is experimental/untested on ARM')
class TestManagedAlloc(ContextResettingTestCase):

    def get_total_gpu_memory(self):
        # We use a driver function to directly get the total GPU memory because
        # an EMM plugin may report something different (or not implement
        # get_memory_info at all).
        if USE_NV_BINDING:
            free, total = driver.cuMemGetInfo()
            return total
        else:
            free = c_size_t()
            total = c_size_t()
            driver.cuMemGetInfo(byref(free), byref(total))
            return total.value

    def skip_if_cc_major_lt(self, min_required, reason):
        """
        Skip the current test if the compute capability of the device is
        less than `min_required`.
        """
        ctx = cuda.current_context()
        cc_major = ctx.device.compute_capability[0]
        if cc_major < min_required:
            self.skipTest(reason)

    # CUDA Unified Memory comes in two flavors. For GPUs in the Kepler and
    # Maxwell generations, managed memory allocations work as opaque,
    # contiguous segments that can either be on the device or the host. For
    # GPUs in the Pascal or later generations, managed memory operates on a
    # per-page basis, so we can have arrays larger than GPU memory, where only
    # part of them is resident on the device at one time. To ensure that this
    # test works correctly on all supported GPUs, we'll select the size of our
    # memory such that we only oversubscribe the GPU memory if we're on a
    # Pascal or newer GPU (compute capability at least 6.0).

    def test_managed_alloc_driver_undersubscribe(self):
        msg = "Managed memory unsupported prior to CC 3.0"
        self.skip_if_cc_major_lt(3, msg)
        self._test_managed_alloc_driver(0.5)

    # This test is skipped by default because it is easy to hang the machine
    # for a very long time or get OOM killed if the GPU memory size is >50% of
    # the system memory size. Even if the system does have more than 2x the RAM
    # of the GPU, this test runs for a very long time (in comparison to the
    # rest of the tests in the suite).
    #
    # However, it is left in here for manual testing as required.

    @unittest.skip
    def test_managed_alloc_driver_oversubscribe(self):
        msg = "Oversubscription of managed memory unsupported prior to CC 6.0"
        self.skip_if_cc_major_lt(6, msg)
        self._test_managed_alloc_driver(2.0)

    def test_managed_alloc_driver_host_attach(self):
        msg = "Host attached managed memory is not accessible prior to CC 6.0"
        self.skip_if_cc_major_lt(6, msg)
        # Only test with a small array (0.01 * memory size) to keep the test
        # quick.
        self._test_managed_alloc_driver(0.01, attach_global=False)

    def _test_managed_alloc_driver(self, memory_factor, attach_global=True):
        # Verify that we can allocate and operate on managed
        # memory through the CUDA driver interface.

        total_mem_size = self.get_total_gpu_memory()
        n_bytes = int(memory_factor * total_mem_size)

        ctx = cuda.current_context()
        mem = ctx.memallocmanaged(n_bytes, attach_global=attach_global)

        dtype = np.dtype(np.uint8)
        n_elems = n_bytes // dtype.itemsize
        ary = np.ndarray(shape=n_elems, dtype=dtype, buffer=mem)

        magic = 0xab
        device_memset(mem, magic, n_bytes)
        ctx.synchronize()

        # Note that this assertion operates on the CPU, so this
        # test effectively drives both the CPU and the GPU on
        # managed memory.

        self.assertTrue(np.all(ary == magic))

    def _test_managed_array(self, attach_global=True):
        # Check the managed_array interface on both host and device.

        ary = cuda.managed_array(100, dtype=np.double)
        ary.fill(123.456)
        self.assertTrue(all(ary == 123.456))

        @cuda.jit('void(double[:])')
        def kernel(x):
            i = cuda.grid(1)
            if i < x.shape[0]:
                x[i] = 1.0

        kernel[10, 10](ary)
        cuda.current_context().synchronize()

        self.assertTrue(all(ary == 1.0))

    def test_managed_array_attach_global(self):
        self._test_managed_array()

    def test_managed_array_attach_host(self):
        self._test_managed_array()
        msg = "Host attached managed memory is not accessible prior to CC 6.0"
        self.skip_if_cc_major_lt(6, msg)
        self._test_managed_array(attach_global=False)


if __name__ == '__main__':
    unittest.main()