Skip to content
Snippets Groups Projects
Commit 42e37522 authored by Michiel Cottaar's avatar Michiel Cottaar
Browse files

Updates to speeding up python talk

parent d18421fb
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
# Numpy vectorizing
%% Cell type:code id: tags:
``` python
import numpy as np
```
%% Cell type:code id: tags:
``` python
def root(a, b, c):
D = b ** 2 - 4 * a * c
if D < 0:
return np.nan, np.nan
x1 = (-b + np.sqrt(D)) / (2 * a)
x2 = (-b - np.sqrt(D)) / (2 * a)
return x1, x2
root(1, 3, 4), root(-1, 3, 4), root(1, 0, 0)
```
%% Output
((nan, nan), (-1.0, 4.0), (0.0, 0.0))
%% Cell type:code id: tags:
``` python
a = np.random.randn(int(1e6))
b = np.random.randn(int(1e6))
c = np.random.randn(int(1e6))
root(a, b, c)
```
%% Output
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-19-5d1fed3ed2df> in <module>()
2 b = np.random.randn(int(1e6))
3 c = np.random.randn(int(1e6))
----> 4 root(a, b, c)
<ipython-input-18-54b500cd66b1> in root(a, b, c)
1 def root(a, b, c):
2 D = b ** 2 - 4 * a * c
----> 3 if D < 0:
4 return np.nan, np.nan
5 x1 = (-b + np.sqrt(D)) / (2 * a)
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
%% Cell type:code id: tags:
``` python
%%time
np.array([root(av, bv, cv) for av, bv, cv in zip(a, b, c)])
```
%% Output
CPU times: user 3.64 s, sys: 65.9 ms, total: 3.71 s
Wall time: 3.71 s
array([[-0.55797188, 1.11121928],
[ nan, nan],
[ 0.76166857, -1.70637551],
...,
[-0.56859783, 0.98659831],
[ nan, nan],
[-0.53531175, 0.18339357]])
%% Cell type:code id: tags:
``` python
root_vec = np.vectorize(root)
np.vectorize?
```
%% Cell type:code id: tags:
``` python
%%time
root_vec(a, b, c)
```
%% Output
CPU times: user 1.85 s, sys: 71.9 ms, total: 1.92 s
Wall time: 1.92 s
(array([-0.55797188, nan, 0.76166857, ..., -0.56859783,
nan, -0.53531175]),
array([ 1.11121928, nan, -1.70637551, ..., 0.98659831,
nan, 0.18339357]))
%% Cell type:code id: tags:
``` python
%%time
root_vec(a, b, 7)
```
%% Output
CPU times: user 1.66 s, sys: 62.7 ms, total: 1.73 s
Wall time: 1.73 s
(array([-1.72246968, -2.63787563, nan, ..., -2.20722406,
nan, -1.84415698]),
array([2.27571708, 2.04353033, nan, ..., 2.62522454, nan,
1.4922388 ]))
%% Cell type:markdown id: tags:
# Numba
%% Cell type:code id: tags:
``` python
import numba
```
%% Cell type:code id: tags:
``` python
root_jit = numba.jit(root)
```
%% Cell type:code id: tags:
``` python
%timeit root(23, 78, 19.0)
```
%% Output
3.08 µs ± 192 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%% Cell type:code id: tags:
``` python
%timeit root_jit(23, 78, 19.0)
```
%% Output
The slowest run took 11.09 times longer than the fastest. This could mean that an intermediate result is being cached.
786 ns ± 1.08 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
%% Cell type:code id: tags:
``` python
%%time
root_jit(a, b, 7)
```
%% Output
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
The above exception was the direct cause of the following exception:
SystemError Traceback (most recent call last)
<timed eval> in <module>()
SystemError: CPUDispatcher(<function root at 0x114035620>) returned a result with an error set
%% Cell type:code id: tags:
``` python
root_jit_vec = np.vectorize(root_jit)
%time root_jit_vec(a, b, 7)
```
%% Output
CPU times: user 406 ms, sys: 57.2 ms, total: 464 ms
Wall time: 464 ms
(array([-1.72246968, -2.63787563, nan, ..., -2.20722406,
nan, -1.84415698]),
array([2.27571708, 2.04353033, nan, ..., 2.62522454, nan,
1.4922388 ]))
%% Cell type:code id: tags:
``` python
@np.vectorize
@numba.jit
def fast_root(a, b, c):
D = b ** 2 - 4 * a * c
if D < 0:
return np.nan, np.nan
x1 = (-b + np.sqrt(D)) / (2 * a)
x2 = (-b - np.sqrt(D)) / (2 * a)
return x1, x2
%time fast_root(a, b, 7)
```
%% Output
CPU times: user 391 ms, sys: 58.2 ms, total: 449 ms
Wall time: 448 ms
(array([-1.72246968, -2.63787563, nan, ..., -2.20722406,
nan, -1.84415698]),
array([2.27571708, 2.04353033, nan, ..., 2.62522454, nan,
1.4922388 ]))
%% Cell type:code id: tags:
``` python
@numba.vectorize
def fast_root(a, b, c):
D = b ** 2 - 4 * a * c
if D < 0:
return np.nan, np.nan
x1 = (-b + np.sqrt(D)) / (2 * a)
x2 = (-b - np.sqrt(D)) / (2 * a)
return x1, x2
%time fast_root(a, b, 7)
```
%% Output
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
<timed eval> in <module>()
~/miniconda3/lib/python3.6/site-packages/numba/npyufunc/dufunc.py in _compile_for_args(self, *args, **kws)
166 argty = argty.dtype
167 argtys.append(argty)
--> 168 return self._compile_for_argtys(tuple(argtys))
169
170 def _compile_for_argtys(self, argtys, return_type=None):
~/miniconda3/lib/python3.6/site-packages/numba/npyufunc/dufunc.py in _compile_for_argtys(self, argtys, return_type)
188 cres, argtys, return_type)
189 dtypenums, ptr, env = ufuncbuilder._build_element_wise_ufunc_wrapper(
--> 190 cres, actual_sig)
191 self._add_loop(utils.longint(ptr), dtypenums)
192 self._keepalive.append((ptr, cres.library, env))
~/miniconda3/lib/python3.6/site-packages/numba/npyufunc/ufuncbuilder.py in _build_element_wise_ufunc_wrapper(cres, signature)
163 # Get dtypes
164 dtypenums = [as_dtype(a).num for a in signature.args]
--> 165 dtypenums.append(as_dtype(signature.return_type).num)
166 return dtypenums, ptr, env
167
~/miniconda3/lib/python3.6/site-packages/numba/numpy_support.py in as_dtype(nbtype)
134 return as_dtype(nbtype.dtype)
135 raise NotImplementedError("%r cannot be represented as a Numpy dtype"
--> 136 % (nbtype,))
137
138
NotImplementedError: (float64 x 2) cannot be represented as a Numpy dtype
%% Cell type:markdown id: tags:
# But run the profiler first
%% Cell type:code id: tags:
``` python
def fib(n):
if n <= 1:
return 1
else:
return fib(n-2) + fib(n-1)
%time fib(33)
```
%% Output
CPU times: user 1.56 s, sys: 3.5 ms, total: 1.56 s
Wall time: 1.56 s
5702887
%% Cell type:code id: tags:
``` python
@numba.jit
def fib_jit(n):
if n <= 1:
return 1
else:
return fib_jit(n-2) + fib_jit(n-1)
%time fib_jit(33)
```
%% Output
CPU times: user 63.4 ms, sys: 1.79 ms, total: 65.2 ms
Wall time: 64.2 ms
5702887
%% Cell type:code id: tags:
``` python
%time fib_jit(41)
```
%% Output
CPU times: user 1.28 s, sys: 5.26 ms, total: 1.28 s
Wall time: 1.28 s
267914296
%% Cell type:code id: tags:
``` python
%prun fib(33)
%prun fib_jit(41)
```
%% Output
%% Cell type:code id: tags:
``` python
%prun fib_jit(41)
%prun fib(33)
```
%% Output
%% Cell type:code id: tags:
``` python
from functools import lru_cache
@lru_cache(None)
def fib_cache(n):
if n <= 1:
return 1
else:
return fib_cache(n-2) + fib_cache(n-1)
%time fib_cache(33)
```
%% Output
CPU times: user 20 µs, sys: 1 µs, total: 21 µs
Wall time: 24.1 µs
%% Cell type:code id: tags:
``` python
lru_cache?
```
%% Cell type:code id: tags:
``` python
%time fib_cache(500)
```
%% Output
CPU times: user 261 µs, sys: 1e+03 ns, total: 262 µs
Wall time: 267 µs
225591516161936330872512695036072072046011324913758190588638866418474627738686883405015987052796968498626
%% Cell type:markdown id: tags:
# Optimizing numpy algebra
%% Cell type:markdown id: tags:
Multiplying/adding arrays incurs a lot of overhead, because lots of intermediate arrays get created and discarded again
%% Cell type:code id: tags:
``` python
a = np.random.randn(int(1e7))
b = np.random.randn(int(1e7))
%timeit a * b + 4.1 * a < 3 * b
```
%% Output
139 ms ± 1.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%% Cell type:code id: tags:
``` python
import numexpr as ne
%timeit ne.evaluate('a * b + 4.1 * a < 3 * b')
```
%% Output
11.4 ms ± 484 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%% Cell type:markdown id: tags:
Efficiency gain reduces once the operations become more complex
%% Cell type:code id: tags:
``` python
%timeit a * np.sqrt(abs(b)) < 3 * b
c = np.random.randn(int(1e7))
%timeit (-b - np.sqrt(b ** 2 - 4 * a * c) ) / (2 * a)
```
%% Output
141 ms ± 1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%% Cell type:code id: tags:
``` python
%timeit ne.evaluate('a * sqrt(abs(b)) < 3 * b')
%timeit ne.evaluate('(-b - sqrt(b ** 2 - 4 * a * c) ) / (2 * a)')
```
%% Output
12.6 ms ± 986 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%% Cell type:markdown id: tags:
# Cython
%% Cell type:markdown id: tags:
http://docs.cython.org/en/latest/
Compiles most python programs to C code for extra speed (but less introspection). Also allows direct calling of C library code.
Can include python classes and much more.
%% Cell type:code id: tags:
``` python
%load_ext cython
```
%% Output
The cython extension is already loaded. To reload it, use:
%reload_ext cython
%% Cell type:code id: tags:
``` python
def primes(kmax):
p = {}
result = []
if kmax > 1000:
kmax = 1000
k = 0
n = 2
while k < kmax:
i = 0
while i < k and n % p[i] != 0:
i = i + 1
if i == k:
p[k] = n
k = k + 1
result.append(n)
n = n + 1
return result
print(primes(10))
%time _ = primes(10000)
```
%% Output
[2, 3, 5, 7, 11, 13, 17, 19, 23, 29]
CPU times: user 67.4 ms, sys: 1.22 ms, total: 68.6 ms
Wall time: 68 ms
%% Cell type:code id: tags:
``` python
%%cython -a
def cprimes(int kmax):
cdef int[1000] p
def cprimes(kmax):
p = {}
result = []
if kmax > 1000:
kmax = 1000
cdef int k, n, i
k = 0
n = 2
while k < kmax:
i = 0
while i < k and n % p[i] != 0:
i = i + 1
if i == k:
p[k] = n
k = k + 1
result.append(n)
n = n + 1
return result
```
%% Output
<IPython.core.display.HTML object>
%% Cell type:code id: tags:
``` python
print(cprimes(10))
%time _ = cprimes(10000)
```
%% Output
[2, 3, 5, 7, 11, 13, 17, 19, 23, 29]
CPU times: user 2.2 ms, sys: 10 µs, total: 2.21 ms
Wall time: 2.22 ms
%% Cell type:markdown id: tags:
## Defining C-style function
%% Cell type:code id: tags:
``` python
#%%cython -a
def fib(n):
%%cython -a
def cfib(n):
if n <= 1:
return 1
else:
return fib(n-2) + fib(n-1)
return cfib(n-2) + cfib(n-1)
```
%% Cell type:code id: tags:
``` python
%time fib(33)
%time cfib(33)
```
%% Output
CPU times: user 1.48 s, sys: 1.79 ms, total: 1.48 s
Wall time: 1.48 s
5702887
%% Cell type:code id: tags:
``` python
%time fib_jit(33)
```
%% Output
CPU times: user 27.5 ms, sys: 73 µs, total: 27.5 ms
Wall time: 27.5 ms
5702887
%% Cell type:code id: tags:
``` python
%%cython -a
cdef int cfib(int n) nogil:
if n <= 1:
return 1
else:
return cfib(n-2) + cfib(n-1)
def fib(n):
return cfib(n)
```
%% Output
<IPython.core.display.HTML object>
%% Cell type:code id: tags:
``` python
%time fib(33)
```
%% Output
CPU times: user 10.2 ms, sys: 138 µs, total: 10.4 ms
Wall time: 10.4 ms
5702887
%% Cell type:markdown id: tags:
## Cython in production code
%% Cell type:code id: tags:
``` python
def root(a, b, c):
D = b ** 2 - 4 * a * c
if D < 0:
return NAN, NAN
x1 = (-b + sqrt(D)) / (2 * a)
x2 = (-b - sqrt(D)) / (2 * a)
return np.nan, np.nan
x1 = (-b + np.sqrt(D)) / (2 * a)
x2 = (-b - np.sqrt(D)) / (2 * a)
return x1, x2
root(1, 3, 4), root(-1, 3, 4), root(1, 0, 0)
```
%% Cell type:code id: tags:
``` python
a = np.random.randn(int(1e6))
b = np.random.randn(int(1e6))
c = np.random.randn(int(1e6))
%time np.vectorize(root)(a, b, 7)
```
%% Output
CPU times: user 236 ms, sys: 53.7 ms, total: 290 ms
Wall time: 288 ms
(array([-1.72246969, -2.6378758 , nan, ..., -2.20722389,
nan, -1.84415698]),
array([2.27571702, 2.04353046, nan, ..., 2.62522459, nan,
1.49223876]))
%% Cell type:code id: tags:
``` python
%time root_jit_vec(a, b, 7)
```
%% Output
CPU times: user 353 ms, sys: 57.8 ms, total: 411 ms
Wall time: 410 ms
(array([-1.72246968, -2.63787563, nan, ..., -2.20722406,
nan, -1.84415698]),
array([2.27571708, 2.04353033, nan, ..., 2.62522454, nan,
1.4922388 ]))
%% Cell type:code id: tags:
``` python
%%cython -a
from libc.math cimport sqrt, NAN
def root(float a, float b, float c):
cdef float D, x1, x2
D = b ** 2 - 4 * a * c
if D < 0:
return NAN, NAN
x1 = (-b + sqrt(D)) / (2 * a)
x2 = (-b - sqrt(D)) / (2 * a)
return x1, x2
print(root(1, 3, 4), root(-1, 3, 4), root(1, 0, 0))
```
%% Cell type:markdown id: tags:
## Cython in production code
## More cython
%% Cell type:markdown id: tags:
http://docs.cython.org/en/latest/src/tutorial/cython_tutorial.html
%% Cell type:markdown id: tags:
# Steps to optimizing code:
- Profile where the bottleneck is (%prun)
- You can install a [line profiler](https://github.com/rkern/line_profiler) (available as %lprun after installation)
- Is there a faster algorithm for the bottleneck?
- If the bottleneck is vectorized: can we optimize with numexpr?
- If the internal part of the loop can not be vectorized:
- numba jit if simple enough otherwise cython
- the loop itself can be sped up with numpy.vectorize
- If the bottleneck gets called too often: cache the result
- If even that fails: pycuda
- If that is too slow: learn to optimize cuda code or find an easier problem
%% Cell type:code id: tags:
``` python
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment