From dbfd4b110d7e859a1b466fc4c5fc6d5af804165d Mon Sep 17 00:00:00 2001
From: Stuart Archibald <redacted>
Date: Fri, 22 Feb 2019 10:39:09 +0000
Subject: [PATCH 09/15] intel mkl_mem all

---
 numpy/core/include/numpy/ndarraytypes.h       |  40 ++---
 numpy/core/setup.py                           |  26 ++-
 numpy/core/src/common/numpyos.c               |   1 +
 numpy/core/src/common/ucsnarrow.c             |   1 +
 numpy/core/src/mkl_defs/aligned_alloc.c       | 149 ++++++++++++++++++
 numpy/core/src/mkl_defs/aligned_alloc.h       |  11 ++
 numpy/core/src/mkl_defs/mkl_cpy.c             |  86 ++++++++++
 numpy/core/src/mkl_defs/mkl_cpy.h             |  10 ++
 numpy/core/src/multiarray/alloc.c             |  20 +++
 .../core/src/multiarray/array_assign_scalar.c |   1 +
 numpy/core/src/multiarray/arrayobject.c       |   2 +
 numpy/core/src/multiarray/arraytypes.c.src    |   2 +
 numpy/core/src/multiarray/compiled_base.c     |   2 +
 numpy/core/src/multiarray/convert.c           |   1 +
 numpy/core/src/multiarray/convert_datatype.c  |   1 +
 numpy/core/src/multiarray/datetime.c          |   2 +
 numpy/core/src/multiarray/datetime_busday.c   |   1 +
 .../core/src/multiarray/datetime_busdaycal.c  |   2 +
 numpy/core/src/multiarray/datetime_strings.c  |   1 +
 numpy/core/src/multiarray/descriptor.c        |   2 +
 numpy/core/src/multiarray/dtype_transfer.c    |   2 +
 numpy/core/src/multiarray/einsum.c.src        |   2 +
 numpy/core/src/multiarray/flagsobject.c       |   1 +
 numpy/core/src/multiarray/getset.c            |   2 +
 numpy/core/src/multiarray/item_selection.c    |  32 +++-
 numpy/core/src/multiarray/iterators.c         |   2 +
 .../multiarray/lowlevel_strided_loops.c.src   |   1 +
 numpy/core/src/multiarray/mapping.c           |   1 +
 numpy/core/src/multiarray/methods.c           |   1 +
 numpy/core/src/multiarray/multiarraymodule.c  |   1 +
 numpy/core/src/multiarray/nditer_api.c        |   2 +
 numpy/core/src/multiarray/nditer_constr.c     |   2 +
 numpy/core/src/multiarray/nditer_templ.c.src  |   1 +
 numpy/core/src/multiarray/scalarapi.c         |   1 +
 numpy/core/src/multiarray/scalartypes.c.src   |   2 +
 numpy/core/src/multiarray/sequence.c          |   1 +
 numpy/core/src/multiarray/shape.c             |   1 +
 numpy/core/src/npysort/npysort_common.h       |   1 +
 numpy/core/src/umath/_rational_tests.c.src    |   1 +
 numpy/core/src/umath/reduction.c              |   1 +
 numpy/core/src/umath/simd.inc.src             |   1 +
 numpy/core/src/umath/ufunc_object.c           |   2 +
 numpy/core/src/umath/ufunc_type_resolution.c  |   2 +
 numpy/core/src/umath/umathmodule.c            |   2 +
 numpy/core/tests/test_regression.py           |  11 ++
 numpy/random/setup.py                         |   4 +-
 46 files changed, 411 insertions(+), 30 deletions(-)
 create mode 100644 numpy/core/src/mkl_defs/aligned_alloc.c
 create mode 100644 numpy/core/src/mkl_defs/aligned_alloc.h
 create mode 100644 numpy/core/src/mkl_defs/mkl_cpy.c
 create mode 100644 numpy/core/src/mkl_defs/mkl_cpy.h

diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h
index b0b749c..9eeb5f9 100644
--- a/numpy/core/include/numpy/ndarraytypes.h
+++ b/numpy/core/include/numpy/ndarraytypes.h
@@ -345,25 +345,27 @@ struct NpyAuxData_tag {
 
 #define NPY_USE_PYMEM 1
 
-#if NPY_USE_PYMEM == 1
-   /* numpy sometimes calls PyArray_malloc() with the GIL released. On Python
-      3.3 and older, it was safe to call PyMem_Malloc() with the GIL released.
-      On Python 3.4 and newer, it's better to use PyMem_RawMalloc() to be able
-      to use tracemalloc. On Python 3.6, calling PyMem_Malloc() with the GIL
-      released is now a fatal error in debug mode. */
-#  if PY_VERSION_HEX >= 0x03040000
-#    define PyArray_malloc PyMem_RawMalloc
-#    define PyArray_free PyMem_RawFree
-#    define PyArray_realloc PyMem_RawRealloc
-#  else
-#    define PyArray_malloc PyMem_Malloc
-#    define PyArray_free PyMem_Free
-#    define PyArray_realloc PyMem_Realloc
-#  endif
-#else
-#define PyArray_malloc malloc
-#define PyArray_free free
-#define PyArray_realloc realloc
+#if !defined(PyArray_malloc)
+#    if NPY_USE_PYMEM == 1
+     /* numpy sometimes calls PyArray_malloc() with the GIL released. On Python
+        3.3 and older, it was safe to call PyMem_Malloc() with the GIL released.
+        On Python 3.4 and newer, it's better to use PyMem_RawMalloc() to be able
+        to use tracemalloc. On Python 3.6, calling PyMem_Malloc() with the GIL
+        released is now a fatal error in debug mode. */
+#       if PY_VERSION_HEX >= 0x03040000
+#          define PyArray_malloc PyMem_RawMalloc
+#          define PyArray_free PyMem_RawFree
+#          define PyArray_realloc PyMem_RawRealloc
+#       else
+#          define PyArray_malloc PyMem_Malloc
+#          define PyArray_free PyMem_Free
+#          define PyArray_realloc PyMem_Realloc
+#       endif
+#   else
+#      define PyArray_malloc malloc
+#      define PyArray_free free
+#      define PyArray_realloc realloc
+#   endif
 #endif
 
 /* Dimensions and strides */
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index fb0386e..486319e 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -614,6 +614,7 @@ def configuration(parent_package='',top_path=None):
     config.add_include_dirs(join('src', 'multiarray'))
     config.add_include_dirs(join('src', 'umath'))
     config.add_include_dirs(join('src', 'npysort'))
+    config.add_include_dirs(join('src', 'mkl_defs'))
 
     config.add_define_macros([("NPY_INTERNAL_BUILD", "1")]) # this macro indicates that Numpy build is in process
     config.add_define_macros([("HAVE_NPY_CONFIG_H", "1")])
@@ -712,6 +713,17 @@ def configuration(parent_package='',top_path=None):
                        sources=npysort_sources,
                        include_dirs=[])
 
+    #######################################################################
+    #                         mkl_cp library                              #
+    #######################################################################
+
+    # This library is created for the build but it is not installed
+    mkl_cp_sources = [join('src', 'mkl_defs', 'mkl_cpy.h'),
+                       join('src', 'mkl_defs', 'mkl_cpy.c')]
+    config.add_library('mkl_cp',
+                       sources=mkl_cp_sources,
+                       include_dirs=[])
+
     #######################################################################
     #                     multiarray_tests module                         #
     #######################################################################
@@ -869,6 +881,11 @@ def configuration(parent_package='',top_path=None):
             join('src', 'multiarray', 'vdot.c'),
             ]
 
+    aligned_alloc_sources = [
+        join('src', 'mkl_defs', 'aligned_alloc.h'),
+        join('src', 'mkl_defs', 'aligned_alloc.c')
+    ]
+
     #######################################################################
     #             _multiarray_umath module - umath part                   #
     #######################################################################
@@ -940,11 +957,11 @@ def configuration(parent_package='',top_path=None):
                                   join('*.py'),
                                   generate_umath_c,
                                   generate_ufunc_api,
-                                 ],
+                                 ] + aligned_alloc_sources,
                          extra_compile_args=['/Qstd=c99' if platform.system() == "Windows" else ''],
                          depends=deps + multiarray_deps + umath_deps +
                                 common_deps,
-                         libraries=['loops', 'npymath', 'npysort'],
+                         libraries=['loops', 'npymath', 'npysort', 'mkl_cp'],
                          extra_info=extra_info)
 
     #######################################################################
@@ -959,7 +976,10 @@ def configuration(parent_package='',top_path=None):
     #######################################################################
 
     config.add_extension('_rational_tests',
-                    sources=[join('src', 'umath', '_rational_tests.c.src')])
+                    sources=[join('src', 'umath', '_rational_tests.c.src')],
+                    libraries=['mkl_cp'],
+                    extra_info=extra_info
+                    )
 
     #######################################################################
     #                        struct_ufunc_test module                     #
diff --git a/numpy/core/src/common/numpyos.c b/numpy/core/src/common/numpyos.c
index d60b1ca..8decd18 100644
--- a/numpy/core/src/common/numpyos.c
+++ b/numpy/core/src/common/numpyos.c
@@ -25,6 +25,7 @@
 #endif
 
 
+#include "mkl_cpy.h"
 
 /*
  * From the C99 standard, section 7.19.6: The exponent always contains at least
diff --git a/numpy/core/src/common/ucsnarrow.c b/numpy/core/src/common/ucsnarrow.c
index 8e293e9..0d902b9 100644
--- a/numpy/core/src/common/ucsnarrow.c
+++ b/numpy/core/src/common/ucsnarrow.c
@@ -14,6 +14,7 @@
 
 #include "npy_pycompat.h"
 #include "ctors.h"
+#include "mkl_cpy.h"
 
 /*
  * Functions only needed on narrow builds of Python for converting back and
diff --git a/numpy/core/src/mkl_defs/aligned_alloc.c b/numpy/core/src/mkl_defs/aligned_alloc.c
new file mode 100644
index 0000000..81e9e33
--- /dev/null
+++ b/numpy/core/src/mkl_defs/aligned_alloc.c
@@ -0,0 +1,149 @@
+#include "mkl.h"
+#include <stdlib.h>
+#include <stddef.h>
+#ifndef Py_PYTHON_H
+#   include "Python.h"
+#endif
+#include "numpy/npy_common.h"
+
+#define ALIGNMENT 64
+#define __THRESHOLD 524288
+#define __UNIT_STRIDE 1
+#define __NULL_STRIDE 0
+#define __8BYTES_ALIGNMENT_OFFSET(ptr) (((size_t) (ptr)) & 0x7)
+#define MKL_INT_MAX ((size_t) (~((MKL_UINT) 0) >> 1))
+
+#if defined(_MSC_VER)
+#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
+#endif
+static int is_tbb_enabled(void) {
+    static int TBB_ENABLED = -1;
+    if (TBB_ENABLED == -1) {
+            char* mkl_threading = getenv("MKL_THREADING_LAYER");
+            TBB_ENABLED = (!mkl_threading || strncmp(mkl_threading, "TBB", 3) != 0 || strncmp(mkl_threading, "tbb", 3) != 0 ) ? 0 : 1;
+    }
+    return TBB_ENABLED;
+}
+
+static NPY_INLINE void call_dcopy_chunked(size_t size, double* src, double* dest) {
+   while (size > MKL_INT_MAX) {
+       cblas_dcopy(MKL_INT_MAX, src , __NULL_STRIDE, dest, __UNIT_STRIDE);
+       size -= MKL_INT_MAX;
+       dest += MKL_INT_MAX;
+   }
+   if (size > 0) {
+       if (size >= __THRESHOLD) {
+            cblas_dcopy(size, src , __NULL_STRIDE, dest, __UNIT_STRIDE);
+       } else {
+            memset(dest, 0, size * sizeof(double));
+       }
+   }
+}
+
+void * _aligned_alloc(size_t size) {
+    /* Only available for Linux and OSX (has been explicitly disabled on Windows : see aligned_alloc.h)
+     * With Windows, we would run into composability issues with modules like h5py which allocate
+     * memory using libc functions in another library, like hdf5 for instance
+     */
+    size = (size > 0) ? size : 1;
+    void* data = NULL;
+    int ret_code = posix_memalign(&data, ALIGNMENT, size);
+    if (ret_code == 0) {
+        return data;
+    }
+    return NULL;
+}
+
+
+#ifdef WITH_ALIGNED_CALLOC
+void * _aligned_calloc(size_t nelem, size_t elsize)
+{
+    size_t size = nelem * elsize;
+    void *data = _aligned_alloc(size);
+    char *memory = NULL;
+
+    if (data != NULL) {
+        memory = (char*) data;
+        if((size > __THRESHOLD) && !is_tbb_enabled()) {
+            size_t offset = __8BYTES_ALIGNMENT_OFFSET(8 - __8BYTES_ALIGNMENT_OFFSET(memory));
+            size_t rem_size, ch_size, n_ch = (size - offset) / sizeof(double);
+            double init = 0;
+            if (offset) {
+                memset(memory, 0, offset);
+            }
+
+            call_dcopy_chunked(n_ch, &init, (double*) (memory+offset));
+
+            ch_size = offset + n_ch * sizeof(double);
+            rem_size = size - ch_size;
+            if(rem_size > 0) {
+                memset(memory + ch_size, 0, rem_size);
+            }
+        } else {
+                memset(memory, 0, size);
+        }
+    }
+    return data;
+}
+#endif
+
+#if PY_VERSION_HEX >= 0x03040000
+static int is_tracemalloc_enabled(void) {
+    static int TRACEMALLOC_PRESENT = -1;
+    if (TRACEMALLOC_PRESENT == -1) {
+        TRACEMALLOC_PRESENT = (getenv("PYTHONTRACEMALLOC")) ? 1 : 0;
+    }
+    return TRACEMALLOC_PRESENT;
+}
+#endif
+
+void* call_aligned_malloc(size_t size) {
+#if PY_VERSION_HEX >= 0x03040000
+    if(is_tracemalloc_enabled()){
+        return PyMem_RawMalloc(size);
+    } else
+#endif
+    {
+        return _aligned_alloc(size);
+    }
+}
+
+void* call_aligned_realloc(void* input, size_t size) {
+#if PY_VERSION_HEX >= 0x03040000
+    if(is_tracemalloc_enabled()){
+        return PyMem_RawRealloc(input, size);
+    } else
+#endif
+    {
+        if (input) {
+          return realloc(input, size ? size : 1);
+        }
+        return _aligned_alloc(size);
+    }
+}
+
+void* call_aligned_calloc(size_t num, size_t size) {
+#if PY_VERSION_HEX >= 0x03040000
+    if(is_tracemalloc_enabled()){
+        return PyMem_RawCalloc(num, size);
+    } else
+#endif
+    {
+#ifdef WITH_ALIGNED_CALLOC
+        return _aligned_calloc(num, size);
+#else
+        return calloc(num, size);
+#endif
+    }
+}
+
+void call_free(void* ptr) {
+#if PY_VERSION_HEX >= 0x03040000
+    if(is_tracemalloc_enabled()){
+        PyMem_RawFree(ptr);
+    } else
+#endif
+    {
+        free(ptr);
+    }
+}
diff --git a/numpy/core/src/mkl_defs/aligned_alloc.h b/numpy/core/src/mkl_defs/aligned_alloc.h
new file mode 100644
index 0000000..edd0080
--- /dev/null
+++ b/numpy/core/src/mkl_defs/aligned_alloc.h
@@ -0,0 +1,11 @@
+#if !defined(ALIGNED_ALLOC_H) && !defined(_MSC_VER)
+#   define ALIGNED_ALLOC_H
+#   include <stddef.h>
+    inline void* call_aligned_malloc(size_t);
+    inline void* call_aligned_realloc(void*, size_t);
+    inline void* call_aligned_calloc(size_t, size_t);
+    inline void call_free(void*);
+#   define PyArray_malloc call_aligned_malloc 
+#   define PyArray_free call_free
+#   define PyArray_realloc call_aligned_realloc 
+#endif
diff --git a/numpy/core/src/mkl_defs/mkl_cpy.c b/numpy/core/src/mkl_defs/mkl_cpy.c
new file mode 100644
index 0000000..93c4fa0
--- /dev/null
+++ b/numpy/core/src/mkl_defs/mkl_cpy.c
@@ -0,0 +1,86 @@
+#include "mkl.h"
+#include <stddef.h>
+
+#define __THRESHOLD 262144
+#define __UNIT_STRIDE 1
+#define __8BYTES_ALIGNMENT_OFFSET(ptr) (((size_t) (ptr)) & 0x7)
+#define MKL_INT_MAX ((size_t) (~((MKL_UINT) 0) >> 1))
+#include "numpy/npy_common.h"
+
+static int is_tbb_enabled(void) {
+        static int TBB_ENABLED = -1;
+        if (TBB_ENABLED == -1) {
+                char* mkl_threading = getenv("MKL_THREADING_LAYER");
+                TBB_ENABLED = (!mkl_threading || strncmp(mkl_threading, "TBB", 3) != 0 || strncmp(mkl_threading, "tbb", 3) != 0 ) ? 0 : 1;
+        }
+        return TBB_ENABLED;
+}
+
+
+static NPY_INLINE void call_dcopy_chunked(size_t size, double* src, double* dest) {
+   while (size > MKL_INT_MAX) {
+       cblas_dcopy(MKL_INT_MAX, src , __UNIT_STRIDE, dest, __UNIT_STRIDE);
+       size -= MKL_INT_MAX;
+       src += MKL_INT_MAX;
+       dest += MKL_INT_MAX;
+   }
+   if (size > 0) {
+        if (size >= __THRESHOLD) {
+            cblas_dcopy(size, src , __UNIT_STRIDE, dest, __UNIT_STRIDE);
+        } else {
+            memmove(dest, src, size * sizeof(double));
+        }
+   }
+}
+
+//#define __DEBUG
+void call_mkl_mv(void* destination, const void* source, size_t size, const char* file_name, const char* func_name, const int line_num) {
+        char* dst = (char*) destination;
+        const char* src = (const char*) source;
+        if((size > __THRESHOLD) && (__8BYTES_ALIGNMENT_OFFSET(src) == __8BYTES_ALIGNMENT_OFFSET(dst)) && ((dst + size < src) || (src + size < dst)) && !is_tbb_enabled()) {
+            // memory segments do not intersect, use threaded MKL BLAS copying functions
+            size_t offset = __8BYTES_ALIGNMENT_OFFSET(8 - __8BYTES_ALIGNMENT_OFFSET(src));
+            size_t rem_size, ch_size, n_ch = (size - offset) / sizeof(double);
+            if (offset) {
+                memmove(dst, src, offset);
+            }
+
+#           ifdef __DEBUG
+            printf("%zu bytes to mkl-mv : %s (%s:%d)\n", size, func_name, file_name, line_num);
+#           endif
+            call_dcopy_chunked(n_ch, (double *) (src + offset), (double*) (dst + offset));
+
+            ch_size = offset + n_ch * sizeof(double);
+            rem_size = size - ch_size;
+            if(rem_size > 0) {
+                memmove(dst + ch_size, src + ch_size, rem_size);
+            }
+        } else {
+                memmove(dst, src, size);
+        }
+}
+
+void call_mkl_cpy(void* destination, const void* source, size_t size, const char* file_name, const char* func_name, const int line_num) {
+        char* dst = (char*) destination;
+        const char* src = (const char*) source;
+        if((size > __THRESHOLD) && (__8BYTES_ALIGNMENT_OFFSET(src) == __8BYTES_ALIGNMENT_OFFSET(dst)) && !is_tbb_enabled()) {
+            size_t offset = __8BYTES_ALIGNMENT_OFFSET(8 - __8BYTES_ALIGNMENT_OFFSET(src));
+            size_t rem_size, ch_size, n_ch = (size - offset) / sizeof(double);
+            if (offset) {
+                memcpy(dst, src, offset);
+            }
+
+#           ifdef __DEBUG
+            printf("%zu bytes to mkl-cp : %s (%s:%d)\n", size, func_name, file_name, line_num);
+#           endif
+            call_dcopy_chunked(n_ch, (double *) (src + offset), (double*) (dst + offset));
+
+            ch_size = offset + n_ch * sizeof(double);
+            rem_size = size - ch_size;
+            if(rem_size > 0) {
+                memmove(dst + ch_size, src + ch_size, rem_size);
+            }
+        } else {
+                memcpy(dst, src, size);
+        }
+}
diff --git a/numpy/core/src/mkl_defs/mkl_cpy.h b/numpy/core/src/mkl_defs/mkl_cpy.h
new file mode 100644
index 0000000..74af118
--- /dev/null
+++ b/numpy/core/src/mkl_defs/mkl_cpy.h
@@ -0,0 +1,10 @@
+#ifdef __has_include
+#   if __has_include("mkl.h") && !defined(MKL_CPY_H)
+#       define MKL_CPY_H
+#       include <stddef.h>
+        void call_mkl_mv(void*, const void*, size_t, const char*, const char*, const int); 
+        void call_mkl_cpy(void*, const void*, size_t, const char*, const char*, const int);
+#       define memcpy(src, dst, size) call_mkl_cpy(src, dst, size, __FILE__, __func__, __LINE__)
+#       define memmove(src, dst, size) call_mkl_mv(src, dst, size, __FILE__, __func__, __LINE__)
+#   endif
+#endif
diff --git a/numpy/core/src/multiarray/alloc.c b/numpy/core/src/multiarray/alloc.c
index 6755095..6e84e81 100644
--- a/numpy/core/src/multiarray/alloc.c
+++ b/numpy/core/src/multiarray/alloc.c
@@ -1,6 +1,7 @@
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
 #include "structmember.h"
+#include "aligned_alloc.h"
 
 #if PY_VERSION_HEX >= 0x03060000
 #include <pymem.h>
@@ -86,6 +87,7 @@ _npy_alloc_cache(npy_uintp nelem, npy_uintp esz, npy_uint msz,
     return p;
 }
 
+
 /*
  * return pointer p to cache, nelem is number of elements of the cache bucket
  * size (1 or sizeof(npy_intp)) of the block pointed too
@@ -153,6 +155,7 @@ npy_alloc_cache_dim(npy_uintp sz)
     if (sz < 2) {
         sz = 2;
     }
+
     return _npy_alloc_cache(sz, sizeof(npy_intp), NBUCKETS_DIM, dimcache,
                             &PyArray_malloc);
 }
@@ -164,6 +167,7 @@ npy_free_cache_dim(void * p, npy_uintp sz)
     if (sz < 2) {
         sz = 2;
     }
+
     _npy_free_cache(p, sz, NBUCKETS_DIM, dimcache,
                     &PyArray_free);
 }
@@ -218,7 +222,11 @@ PyDataMem_NEW(size_t size)
 {
     void *result;
 
+#ifdef ALIGNED_ALLOC_H
+    result = call_aligned_malloc(size);
+#else
     result = malloc(size);
+#endif
     if (_PyDataMem_eventhook != NULL) {
         NPY_ALLOW_C_API_DEF
         NPY_ALLOW_C_API
@@ -240,7 +248,11 @@ PyDataMem_NEW_ZEROED(size_t size, size_t elsize)
 {
     void *result;
 
+#ifdef ALIGNED_ALLOC_H
+    result = call_aligned_calloc(size, elsize);
+#else
     result = calloc(size, elsize);
+#endif
     if (_PyDataMem_eventhook != NULL) {
         NPY_ALLOW_C_API_DEF
         NPY_ALLOW_C_API
@@ -261,7 +273,11 @@ NPY_NO_EXPORT void
 PyDataMem_FREE(void *ptr)
 {
     PyTraceMalloc_Untrack(NPY_TRACE_DOMAIN, (npy_uintp)ptr);
+#ifdef ALIGNED_ALLOC_H
+    call_free(ptr);
+#else
     free(ptr);
+#endif
     if (_PyDataMem_eventhook != NULL) {
         NPY_ALLOW_C_API_DEF
         NPY_ALLOW_C_API
@@ -281,7 +297,11 @@ PyDataMem_RENEW(void *ptr, size_t size)
 {
     void *result;
 
+#ifdef ALIGNED_ALLOC_H
+    result = call_aligned_realloc(ptr, size);
+#else
     result = realloc(ptr, size);
+#endif
     if (result != ptr) {
         PyTraceMalloc_Untrack(NPY_TRACE_DOMAIN, (npy_uintp)ptr);
     }
diff --git a/numpy/core/src/multiarray/array_assign_scalar.c b/numpy/core/src/multiarray/array_assign_scalar.c
index ecb5be4..6995bbf 100644
--- a/numpy/core/src/multiarray/array_assign_scalar.c
+++ b/numpy/core/src/multiarray/array_assign_scalar.c
@@ -9,6 +9,7 @@
 
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
+#include "aligned_alloc.h"
 
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c
index 97aaee9..b5a8215 100644
--- a/numpy/core/src/multiarray/arrayobject.c
+++ b/numpy/core/src/multiarray/arrayobject.c
@@ -23,6 +23,7 @@ maintainer email:  oliphant.travis@ieee.org
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
 #include "structmember.h"
+#include "aligned_alloc.h"
 
 /*#include <stdio.h>*/
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
@@ -53,6 +54,7 @@ maintainer email:  oliphant.travis@ieee.org
 #include "alloc.h"
 #include "mem_overlap.h"
 #include "numpyos.h"
+#include "mkl_cpy.h"
 #include "strfuncs.h"
 
 #include "binop_override.h"
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 823ee71..7f2439d 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -9,6 +9,7 @@
 #define _MULTIARRAYMODULE
 #define _NPY_NO_DEPRECATIONS /* for NPY_CHAR */
 
+#include "aligned_alloc.h"
 #include "numpy/npy_common.h"
 #include "numpy/arrayobject.h"
 #include "numpy/arrayscalars.h"
@@ -1971,6 +1972,7 @@ static void
 
 /**end repeat**/
 
+
 /**begin repeat
  *
  * #fname = BOOL,
diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c
index 10e3478..bacf99d 100644
--- a/numpy/core/src/multiarray/compiled_base.c
+++ b/numpy/core/src/multiarray/compiled_base.c
@@ -2,12 +2,14 @@
 #include <Python.h>
 #include <structmember.h>
 #include <string.h>
+#include "aligned_alloc.h"
 
 #define _MULTIARRAYMODULE
 #include "numpy/arrayobject.h"
 #include "numpy/npy_3kcompat.h"
 #include "numpy/npy_math.h"
 #include "npy_config.h"
+#include "mkl_cpy.h"
 #include "templ_common.h" /* for npy_mul_with_overflow_intp */
 #include "lowlevel_strided_loops.h" /* for npy_bswap8 */
 #include "alloc.h"
diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c
index 7db4673..354e1da 100644
--- a/numpy/core/src/multiarray/convert.c
+++ b/numpy/core/src/multiarray/convert.c
@@ -22,6 +22,7 @@
 #include "array_assign.h"
 
 #include "convert.h"
+#include "mkl_cpy.h"
 
 int
 fallocate(int fd, int mode, off_t offset, off_t len);
diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c
index 33a7064..6a374aa 100644
--- a/numpy/core/src/multiarray/convert_datatype.c
+++ b/numpy/core/src/multiarray/convert_datatype.c
@@ -19,6 +19,7 @@
 #include "convert_datatype.h"
 #include "_datetime.h"
 #include "datetime_strings.h"
+#include "mkl_cpy.h"
 
 
 /*
diff --git a/numpy/core/src/multiarray/datetime.c b/numpy/core/src/multiarray/datetime.c
index a8550d9..82e874e 100644
--- a/numpy/core/src/multiarray/datetime.c
+++ b/numpy/core/src/multiarray/datetime.c
@@ -15,6 +15,7 @@
 
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
+#include "aligned_alloc.h"
 #include <numpy/arrayobject.h>
 
 #include "npy_config.h"
@@ -25,6 +26,7 @@
 #include "methods.h"
 #include "_datetime.h"
 #include "datetime_strings.h"
+#include "mkl_cpy.h"
 
 /*
  * Imports the PyDateTime functions so we can create these objects.
diff --git a/numpy/core/src/multiarray/datetime_busday.c b/numpy/core/src/multiarray/datetime_busday.c
index c04a6c1..df58067 100644
--- a/numpy/core/src/multiarray/datetime_busday.c
+++ b/numpy/core/src/multiarray/datetime_busday.c
@@ -22,6 +22,7 @@
 #include "_datetime.h"
 #include "datetime_busday.h"
 #include "datetime_busdaycal.h"
+#include "mkl_cpy.h"
 
 /* Gets the day of the week for a datetime64[D] value */
 static int
diff --git a/numpy/core/src/multiarray/datetime_busdaycal.c b/numpy/core/src/multiarray/datetime_busdaycal.c
index 7a26868..561133c 100644
--- a/numpy/core/src/multiarray/datetime_busdaycal.c
+++ b/numpy/core/src/multiarray/datetime_busdaycal.c
@@ -13,6 +13,7 @@
 
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
+#include "aligned_alloc.h"
 #include <numpy/arrayobject.h>
 
 #include "npy_config.h"
@@ -24,6 +25,7 @@
 #include "_datetime.h"
 #include "datetime_busday.h"
 #include "datetime_busdaycal.h"
+#include "mkl_cpy.h"
 
 NPY_NO_EXPORT int
 PyArray_WeekMaskConverter(PyObject *weekmask_in, npy_bool *weekmask)
diff --git a/numpy/core/src/multiarray/datetime_strings.c b/numpy/core/src/multiarray/datetime_strings.c
index 95b7bb3..9c135cb 100644
--- a/numpy/core/src/multiarray/datetime_strings.c
+++ b/numpy/core/src/multiarray/datetime_strings.c
@@ -23,6 +23,7 @@
 #include "methods.h"
 #include "_datetime.h"
 #include "datetime_strings.h"
+#include "mkl_cpy.h"
 
 /*
  * Platform-specific time_t typedef. Some platforms use 32 bit, some use 64 bit
diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c
index 3038e4d..a32254d 100644
--- a/numpy/core/src/multiarray/descriptor.c
+++ b/numpy/core/src/multiarray/descriptor.c
@@ -6,6 +6,7 @@
 
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
+#include "aligned_alloc.h"
 #include "numpy/arrayobject.h"
 #include "numpy/arrayscalars.h"
 
@@ -17,6 +18,7 @@
 #include "common.h"
 #include "templ_common.h" /* for npy_mul_with_overflow_intp */
 #include "descriptor.h"
+#include "mkl_cpy.h"
 #include "alloc.h"
 #include "assert.h"
 #include "buffer.h"
diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c
index 63b1ead..a2d5769 100644
--- a/numpy/core/src/multiarray/dtype_transfer.c
+++ b/numpy/core/src/multiarray/dtype_transfer.c
@@ -13,6 +13,7 @@
 #define PY_SSIZE_T_CLEAN
 #include "Python.h"
 #include "structmember.h"
+#include "aligned_alloc.h"
 
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
@@ -30,6 +31,7 @@
 
 #include "shape.h"
 #include "lowlevel_strided_loops.h"
+#include "mkl_cpy.h"
 #include "alloc.h"
 
 #define NPY_LOWLEVEL_BUFFER_BLOCKSIZE  128
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index 1765982..d8bf721 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -48,6 +48,8 @@
 #include <emmintrin.h>
 #endif
 
+#include "mkl_cpy.h"
+
 #define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0)
 
 /********** PRINTF DEBUG TRACING **************/
diff --git a/numpy/core/src/multiarray/flagsobject.c b/numpy/core/src/multiarray/flagsobject.c
index 85ea49f..bbd2c61 100644
--- a/numpy/core/src/multiarray/flagsobject.c
+++ b/numpy/core/src/multiarray/flagsobject.c
@@ -15,6 +15,7 @@
 #include "array_assign.h"
 
 #include "common.h"
+#include "mkl_cpy.h"
 
 static void
 _UpdateContiguousFlags(PyArrayObject *ap);
diff --git a/numpy/core/src/multiarray/getset.c b/numpy/core/src/multiarray/getset.c
index 24962da..0e1b84a 100644
--- a/numpy/core/src/multiarray/getset.c
+++ b/numpy/core/src/multiarray/getset.c
@@ -6,6 +6,7 @@
 
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
+#include "aligned_alloc.h"
 #include "numpy/arrayobject.h"
 
 #include "npy_config.h"
@@ -19,6 +20,7 @@
 #include "getset.h"
 #include "arrayobject.h"
 #include "mem_overlap.h"
+#include "mkl_cpy.h"
 #include "alloc.h"
 #include "buffer.h"
 
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index a7c6b14..7c72d49 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -4,6 +4,7 @@
 
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
+#include "aligned_alloc.h"
 #include "numpy/arrayobject.h"
 #include "numpy/arrayscalars.h"
 
@@ -25,8 +26,11 @@
 #include "npy_sort.h"
 #include "npy_partition.h"
 #include "npy_binsearch.h"
+#include "mkl_cpy.c"
 #include "alloc.h"
 
+#define memmove(src, dst, size) call_mkl_mv(src, dst, size, __FILE__, __func__, __LINE__)
+
 /*NUMPY_API
  * Take
  */
@@ -626,14 +630,28 @@ PyArray_Repeat(PyArrayObject *aop, PyObject *op, int axis)
     for (i = 0; i < axis; i++) {
         n_outer *= PyArray_DIMS(aop)[i];
     }
-    for (i = 0; i < n_outer; i++) {
-        for (j = 0; j < n; j++) {
-            npy_intp tmp = broadcast ? counts[0] : counts[j];
-            for (k = 0; k < tmp; k++) {
-                memcpy(new_data, old_data, chunk);
-                new_data += chunk;
+    
+    if(chunk > __THRESHOLD) {
+        for (i = 0; i < n_outer; i++) {
+            for (j = 0; j < n; j++) {
+                npy_intp tmp = broadcast ? counts[0] : counts[j];
+                for (k = 0; k < tmp; k++) {
+                    call_mkl_cpy(new_data, old_data, chunk, __FILE__, __func__, __LINE__);
+                    new_data += chunk;
+                }
+                old_data += chunk;
+            }
+        }
+    } else {
+        for (i = 0; i < n_outer; i++) {
+            for (j = 0; j < n; j++) {
+                npy_intp tmp = broadcast ? counts[0] : counts[j];
+                for (k = 0; k < tmp; k++) {
+                    memcpy(new_data, old_data, chunk);
+                    new_data += chunk;
+                }
+                old_data += chunk;
             }
-            old_data += chunk;
         }
     }
 
diff --git a/numpy/core/src/multiarray/iterators.c b/numpy/core/src/multiarray/iterators.c
index a3bc8e7..cdbf482 100644
--- a/numpy/core/src/multiarray/iterators.c
+++ b/numpy/core/src/multiarray/iterators.c
@@ -4,6 +4,7 @@
 
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
+#include "aligned_alloc.h"
 #include "numpy/arrayobject.h"
 #include "numpy/arrayscalars.h"
 
@@ -15,6 +16,7 @@
 #include "iterators.h"
 #include "ctors.h"
 #include "common.h"
+#include "mkl_cpy.h"
 
 #define NEWAXIS_INDEX -1
 #define ELLIPSIS_INDEX -2
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
index 16bacf1..85c498b 100644
--- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -16,6 +16,7 @@
 #include <numpy/arrayobject.h>
 #include <numpy/npy_cpu.h>
 #include <numpy/halffloat.h>
+#include "mkl_cpy.h"
 
 #include "lowlevel_strided_loops.h"
 #include "array_assign.h"
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index 17edd2b..c436e1f 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -1,6 +1,7 @@
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
 #include "structmember.h"
+#include "aligned_alloc.h"
 
 /*#include <stdio.h>*/
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index 7c814e6..e14a73c 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -24,6 +24,7 @@
 #include "array_assign.h"
 
 #include "methods.h"
+#include "mkl_cpy.h"
 #include "alloc.h"
 
 
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 8135769..5a0fa59 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -17,6 +17,7 @@
 #define PY_SSIZE_T_CLEAN
 #include "Python.h"
 #include "structmember.h"
+#include "aligned_alloc.h"
 
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _UMATHMODULE
diff --git a/numpy/core/src/multiarray/nditer_api.c b/numpy/core/src/multiarray/nditer_api.c
index 7a33ac0..ce0090f 100644
--- a/numpy/core/src/multiarray/nditer_api.c
+++ b/numpy/core/src/multiarray/nditer_api.c
@@ -13,9 +13,11 @@
 
 /* Indicate that this .c file is allowed to include the header */
 #define NPY_ITERATOR_IMPLEMENTATION_CODE
+#include "aligned_alloc.h"
 #include "nditer_impl.h"
 #include "templ_common.h"
 #include "ctors.h"
+#include "mkl_cpy.h"
 
 /* Internal helper functions private to this file */
 static npy_intp
diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c
index 90cff40..1472e8e 100644
--- a/numpy/core/src/multiarray/nditer_constr.c
+++ b/numpy/core/src/multiarray/nditer_constr.c
@@ -13,11 +13,13 @@
 
 /* Indicate that this .c file is allowed to include the header */
 #define NPY_ITERATOR_IMPLEMENTATION_CODE
+#include "aligned_alloc.h"
 #include "nditer_impl.h"
 
 #include "arrayobject.h"
 #include "templ_common.h"
 #include "array_assign.h"
+#include "mkl_cpy.h"
 
 /* Internal helper functions private to this file */
 static int
diff --git a/numpy/core/src/multiarray/nditer_templ.c.src b/numpy/core/src/multiarray/nditer_templ.c.src
index 0f0d599..e143fd6 100644
--- a/numpy/core/src/multiarray/nditer_templ.c.src
+++ b/numpy/core/src/multiarray/nditer_templ.c.src
@@ -11,6 +11,7 @@
 /* Indicate that this .c file is allowed to include the header */
 #define NPY_ITERATOR_IMPLEMENTATION_CODE
 #include "nditer_impl.h"
+#include "mkl_cpy.h"
 
 /* SPECIALIZED iternext functions that handle the non-buffering part */
 
diff --git a/numpy/core/src/multiarray/scalarapi.c b/numpy/core/src/multiarray/scalarapi.c
index bc435d1..deab1af 100644
--- a/numpy/core/src/multiarray/scalarapi.c
+++ b/numpy/core/src/multiarray/scalarapi.c
@@ -18,6 +18,7 @@
 #include "scalartypes.h"
 
 #include "common.h"
+#include "mkl_cpy.h"
 
 static PyArray_Descr *
 _descr_from_subtype(PyObject *type)
diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src
index 2f71c8a..c08c9ea 100644
--- a/numpy/core/src/multiarray/scalartypes.c.src
+++ b/numpy/core/src/multiarray/scalartypes.c.src
@@ -8,6 +8,7 @@
 #define _MULTIARRAYMODULE
 #endif
 
+#include "aligned_alloc.h"
 #include "numpy/arrayobject.h"
 #include "numpy/npy_math.h"
 #include "numpy/halffloat.h"
@@ -31,6 +32,7 @@
 #include "buffer.h"
 
 #include <stdlib.h>
+#include "mkl_cpy.h"
 
 #include "binop_override.h"
 
diff --git a/numpy/core/src/multiarray/sequence.c b/numpy/core/src/multiarray/sequence.c
index 4769bda..e5415e3 100644
--- a/numpy/core/src/multiarray/sequence.c
+++ b/numpy/core/src/multiarray/sequence.c
@@ -16,6 +16,7 @@
 
 #include "sequence.h"
 #include "calculation.h"
+#include "mkl_cpy.h"
 
 /*************************************************************************
  ****************   Implement Sequence Protocol **************************
diff --git a/numpy/core/src/multiarray/shape.c b/numpy/core/src/multiarray/shape.c
index 3082073..7422173 100644
--- a/numpy/core/src/multiarray/shape.c
+++ b/numpy/core/src/multiarray/shape.c
@@ -16,6 +16,7 @@
 #include "ctors.h"
 
 #include "shape.h"
+#include "mkl_cpy.h"
 
 #include "multiarraymodule.h" /* for interned strings */
 #include "templ_common.h" /* for npy_mul_with_overflow_intp */
diff --git a/numpy/core/src/npysort/npysort_common.h b/numpy/core/src/npysort/npysort_common.h
index a22045b..17c649a 100644
--- a/numpy/core/src/npysort/npysort_common.h
+++ b/numpy/core/src/npysort/npysort_common.h
@@ -3,6 +3,7 @@
 
 #include <stdlib.h>
 #include <numpy/ndarraytypes.h>
+#include "mkl_cpy.h"
 
 /*
  *****************************************************************************
diff --git a/numpy/core/src/umath/_rational_tests.c.src b/numpy/core/src/umath/_rational_tests.c.src
index 9e74845..f98a0ad 100644
--- a/numpy/core/src/umath/_rational_tests.c.src
+++ b/numpy/core/src/umath/_rational_tests.c.src
@@ -8,6 +8,7 @@
 #include <numpy/ufuncobject.h>
 #include <numpy/npy_3kcompat.h>
 #include <math.h>
+#include "mkl_cpy.h"
 
 #include "common.h"  /* for error_converting */
 
diff --git a/numpy/core/src/umath/reduction.c b/numpy/core/src/umath/reduction.c
index 6d04ce3..eb12b4e 100644
--- a/numpy/core/src/umath/reduction.c
+++ b/numpy/core/src/umath/reduction.c
@@ -23,6 +23,7 @@
 #include "lowlevel_strided_loops.h"
 #include "reduction.h"
 #include "extobj.h"  /* for _check_ufunc_fperr */
+#include "mkl_cpy.h"
 
 /*
  * Allocates a result array for a reduction operation, with
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 4bb8569..114a98b 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -31,6 +31,7 @@
 #include <stdlib.h>
 #include <float.h>
 #include <string.h> /* for memcpy */
+#include "mkl_cpy.h"
 
 #define VECTOR_SIZE_BYTES 16
 
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index a2df586..35c5ca6 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -28,6 +28,7 @@
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 
 #include "Python.h"
+#include "aligned_alloc.h"
 
 #include "npy_config.h"
 
@@ -47,6 +48,7 @@
 #include "extobj.h"
 #include "common.h"
 #include "numpyos.h"
+#include "mkl_cpy.h"
 
 /********** PRINTF DEBUG TRACING **************/
 #define NPY_UF_DBG_TRACING 0
diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c
index ec60d9c..e876d33 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.c
+++ b/numpy/core/src/umath/ufunc_type_resolution.c
@@ -13,6 +13,7 @@
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 
 #include "Python.h"
+#include "aligned_alloc.h"
 
 #include "npy_config.h"
 #include "npy_pycompat.h"
@@ -21,6 +22,7 @@
 #include "ufunc_type_resolution.h"
 #include "ufunc_object.h"
 #include "common.h"
+#include "mkl_cpy.h"
 
 #include "mem_overlap.h"
 #if defined(HAVE_CBLAS)
diff --git a/numpy/core/src/umath/umathmodule.c b/numpy/core/src/umath/umathmodule.c
index 5de19fe..0f5fb38 100644
--- a/numpy/core/src/umath/umathmodule.c
+++ b/numpy/core/src/umath/umathmodule.c
@@ -21,6 +21,7 @@
 
 #include "Python.h"
 
+#include "aligned_alloc.h"
 #include "npy_config.h"
 
 #include "numpy/arrayobject.h"
@@ -30,6 +31,7 @@
 
 #include "numpy/npy_math.h"
 #include "number.h"
+#include "mkl_cpy.h"
 
 static PyUFuncGenericFunction pyfunc_functions[] = {PyUFunc_On_Om};
 
diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py
index 2421a11..d3c6124 100644
--- a/numpy/core/tests/test_regression.py
+++ b/numpy/core/tests/test_regression.py
@@ -1330,6 +1330,17 @@ class TestRegression(object):
             np.arange(sz)
             assert_(np.size == sz)
 
+    @dec.skipif(True, reason="Memory hog. Skip to save infrastructure resources")
+    def test_huge_array_copy(self):
+        #check behavior of altered memcpy/memmove when size of array > 2**sizeof(int)
+        try:
+            arr1 = np.ones([715827883, 3], np.double)
+            arr2 = np.copy(arr1)
+            assert_equal(arr1, arr2)
+        except MemoryError as e:
+            #if cannot allocate memory, let the test pass
+            pass
+
     def test_fromiter_bytes(self):
         # Ticket #1058
         a = np.fromiter(list(range(10)), dtype='b')
diff --git a/numpy/random/setup.py b/numpy/random/setup.py
index 394a70e..c6c92cd 100644
--- a/numpy/random/setup.py
+++ b/numpy/random/setup.py
@@ -4,6 +4,7 @@ from os.path import join
 import sys
 from distutils.dep_util import newer
 from distutils.msvccompiler import get_build_version as get_msvc_build_version
+from numpy.distutils.system_info import get_info
 
 def needs_mingw_ftime_workaround():
     # We need the mingw workaround for _ftime if the msvc runtime version is
@@ -39,7 +40,7 @@ def configuration(parent_package='',top_path=None):
     if needs_mingw_ftime_workaround():
         defs.append(("NPY_NEEDS_MINGW_TIME_WORKAROUND", None))
 
-    libs = []
+    libs = ['mkl_cp']
     # Configure mtrand
     config.add_extension('mtrand',
                          sources=[join('mtrand', x) for x in
@@ -50,6 +51,7 @@ def configuration(parent_package='',top_path=None):
                                   join('mtrand', '*.pyx'),
                                   join('mtrand', '*.pxi'),],
                          define_macros=defs,
+                         extra_info=get_info('mkl')
                          )
 
     config.add_data_files(('.', join('mtrand', 'randomkit.h')))
-- 
2.20.1

