speed.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Numpy vectorizing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def root(a, b, c):\n",
    "    D = b ** 2 - 4 * a * c\n",
    "    if D < 0:\n",
    "        return np.nan, np.nan\n",
    "    x1 = (-b + np.sqrt(D)) / (2 * a)\n",
    "    x2 = (-b - np.sqrt(D)) / (2 * a)\n",
    "    return x1, x2\n",
    "root(1, 3, 4), root(-1, 3, 4), root(1, 0, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = np.random.randn(int(1e6))\n",
    "b = np.random.randn(int(1e6))\n",
    "c = np.random.randn(int(1e6))\n",
    "root(a, b, c)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "np.array([root(av, bv, cv) for av, bv, cv in zip(a, b, c)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "root_vec = np.vectorize(root)\n",
    "np.vectorize?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "root_vec(a, b, c)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "root_vec(a, b, 7)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Numba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "root_jit = numba.jit(root)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%timeit root(23, 78, 19.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%timeit root_jit(23, 78, 19.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "root_jit(a, b, 7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "root_jit_vec = np.vectorize(root_jit)\n",
    "%time root_jit_vec(a, b, 7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@np.vectorize\n",
    "@numba.jit\n",
    "def fast_root(a, b, c):\n",
    "    D = b ** 2 - 4 * a * c\n",
    "    if D < 0:\n",
    "        return np.nan, np.nan\n",
    "    x1 = (-b + np.sqrt(D)) / (2 * a)\n",
    "    x2 = (-b - np.sqrt(D)) / (2 * a)\n",
    "    return x1, x2\n",
    "%time fast_root(a, b, 7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@numba.vectorize\n",
    "def fast_root(a, b, c):\n",
    "    D = b ** 2 - 4 * a * c\n",
    "    if D < 0:\n",
    "        return np.nan, np.nan\n",
    "    x1 = (-b + np.sqrt(D)) / (2 * a)\n",
    "    x2 = (-b - np.sqrt(D)) / (2 * a)\n",
    "    return x1, x2\n",
    "%time fast_root(a, b, 7)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# But run the profiler first"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fib(n):\n",
    "    if n <= 1:\n",
    "        return 1\n",
    "    else:\n",
    "        return fib(n-2) + fib(n-1)\n",
    "%time fib(33)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@numba.jit\n",
    "def fib_jit(n):\n",
    "    if n <= 1:\n",
    "        return 1\n",
    "    else:\n",
    "        return fib_jit(n-2) + fib_jit(n-1)\n",
    "%time fib_jit(33)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time fib_jit(41)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%prun fib_jit(41)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%prun fib(33)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from functools import lru_cache\n",
    "@lru_cache(None)\n",
    "def fib_cache(n):\n",
    "    if n <= 1:\n",
    "        return 1\n",
    "    else:\n",
    "        return fib_cache(n-2) + fib_cache(n-1)\n",
    "%time fib_cache(33)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time fib_cache(500)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Optimizing numpy algebra"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Multiplying/adding arrays incurs a lot of overhead, because lots of intermediate arrays get created and discarded again"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = np.random.randn(int(1e7))\n",
    "b = np.random.randn(int(1e7))\n",
    "%timeit a * b + 4.1 * a < 3 * b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numexpr as ne\n",
    "%timeit ne.evaluate('a * b + 4.1 * a < 3 * b')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Efficiency gain reduces once the operations become more complex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = np.random.randn(int(1e7))\n",
    "%timeit (-b - np.sqrt(b ** 2 - 4 * a * c) ) / (2 * a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%timeit ne.evaluate('(-b - sqrt(b ** 2 - 4 * a * c) ) / (2 * a)')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Cython"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "http://docs.cython.org/en/latest/\n",
    "\n",
    "Compiles most python programs to C code for extra speed (but less introspection). Also allows direct calling of C library code.\n",
    "\n",
    "Can include python classes and much more."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext cython"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def primes(kmax):\n",
    "    p = {}\n",
    "    result = []\n",
    "    if kmax > 1000:\n",
    "        kmax = 1000\n",
    "    k = 0\n",
    "    n = 2\n",
    "    while k < kmax:\n",
    "        i = 0\n",
    "        while i < k and n % p[i] != 0:\n",
    "            i = i + 1\n",
    "        if i == k:\n",
    "            p[k] = n\n",
    "            k = k + 1\n",
    "            result.append(n)\n",
    "        n = n + 1\n",
    "    return result\n",
    "print(primes(10))\n",
    "%time _ = primes(10000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%cython -a\n",
    "def cprimes(kmax):\n",
    "    p = {}\n",
    "    result = []\n",
    "    if kmax > 1000:\n",
    "        kmax = 1000\n",
    "    k = 0\n",
    "    n = 2\n",
    "    while k < kmax:\n",
    "        i = 0\n",
    "        while i < k and n % p[i] != 0:\n",
    "            i = i + 1\n",
    "        if i == k:\n",
    "            p[k] = n\n",
    "            k = k + 1\n",
    "            result.append(n)\n",
    "        n = n + 1\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(cprimes(10))\n",
    "%time _ = cprimes(10000)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Defining C-style function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%cython -a\n",
    "def cfib(n):\n",
    "    if n <= 1:\n",
    "        return 1\n",
    "    else:\n",
    "        return cfib(n-2) + cfib(n-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time cfib(33)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time fib_jit(33)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%cython -a\n",
    "cdef int cfib(int n) nogil:\n",
    "    if n <= 1:\n",
    "        return 1\n",
    "    else:\n",
    "        return cfib(n-2) + cfib(n-1)\n",
    "\n",
    "def fib(n):\n",
    "    return cfib(n)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time fib(33)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cython in production code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def root(a, b, c):\n",
    "    D = b ** 2 - 4 * a * c\n",
    "    if D < 0:\n",
    "        return np.nan, np.nan\n",
    "    x1 = (-b + np.sqrt(D)) / (2 * a)\n",
    "    x2 = (-b - np.sqrt(D)) / (2 * a)\n",
    "    return x1, x2\n",
    "root(1, 3, 4), root(-1, 3, 4), root(1, 0, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = np.random.randn(int(1e6))\n",
    "b = np.random.randn(int(1e6))\n",
    "c = np.random.randn(int(1e6))\n",
    "%time np.vectorize(root)(a, b, 7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time root_jit_vec(a, b, 7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%%cython -a\n",
    "from libc.math cimport sqrt, NAN\n",
    "\n",
    "def root(float a, float b, float c):\n",
    "    cdef float D, x1, x2\n",
    "    D = b ** 2 - 4 * a * c\n",
    "    if D < 0:\n",
    "        return NAN, NAN\n",
    "    x1 = (-b + sqrt(D)) / (2 * a)\n",
    "    x2 = (-b - sqrt(D)) / (2 * a)\n",
    "    return x1, x2\n",
    "print(root(1, 3, 4), root(-1, 3, 4), root(1, 0, 0))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## More cython"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "http://docs.cython.org/en/latest/src/tutorial/cython_tutorial.html"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Steps to optimizing code:\n",
    "- Profile where the bottleneck is (%prun)\n",
    "  - You can install a [line profiler](https://github.com/rkern/line_profiler) (available as %lprun after installation)\n",
    "- Is there a faster algorithm for the bottleneck?\n",
    "- If the bottleneck is vectorized: can we optimize with numexpr?\n",
    "- If the internal part of the loop can not be vectorized:\n",
    "    - numba jit if simple enough otherwise cython\n",
    "    - the loop itself can be sped up with numpy.vectorize\n",
    "- If the bottleneck gets called too often: cache the result\n",
    "- If even that fails: pycuda\n",
    "- If that is too slow: learn to optimize cuda code or find an easier problem"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  },
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "179px",
    "width": "252px"
   },
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 4.0,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}