Skip to content
Snippets Groups Projects
Commit 20a10d7b authored by Paul McCarthy's avatar Paul McCarthy :mountain_bicyclist:
Browse files

Merge branch 'master' into 'master'

Updates to speeding up python talk

See merge request fsl/pytreat-2018-practicals!43
parents a9f7ad49 42e37522
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
# Numpy vectorizing
%% Cell type:code id: tags:
``` python
import numpy as np
```
%% Cell type:code id: tags:
``` python
def root(a, b, c):
D = b ** 2 - 4 * a * c
if D < 0:
return np.nan, np.nan
x1 = (-b + np.sqrt(D)) / (2 * a)
x2 = (-b - np.sqrt(D)) / (2 * a)
return x1, x2
root(1, 3, 4), root(-1, 3, 4), root(1, 0, 0)
```
%% Output
((nan, nan), (-1.0, 4.0), (0.0, 0.0))
%% Cell type:code id: tags:
``` python
a = np.random.randn(int(1e6))
b = np.random.randn(int(1e6))
c = np.random.randn(int(1e6))
root(a, b, c)
```
%% Output
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-19-5d1fed3ed2df> in <module>()
2 b = np.random.randn(int(1e6))
3 c = np.random.randn(int(1e6))
----> 4 root(a, b, c)
<ipython-input-18-54b500cd66b1> in root(a, b, c)
1 def root(a, b, c):
2 D = b ** 2 - 4 * a * c
----> 3 if D < 0:
4 return np.nan, np.nan
5 x1 = (-b + np.sqrt(D)) / (2 * a)
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
%% Cell type:code id: tags:
``` python
%%time
np.array([root(av, bv, cv) for av, bv, cv in zip(a, b, c)])
```
%% Output
CPU times: user 3.64 s, sys: 65.9 ms, total: 3.71 s
Wall time: 3.71 s
array([[-0.55797188, 1.11121928],
[ nan, nan],
[ 0.76166857, -1.70637551],
...,
[-0.56859783, 0.98659831],
[ nan, nan],
[-0.53531175, 0.18339357]])
%% Cell type:code id: tags:
``` python
root_vec = np.vectorize(root)
np.vectorize?
```
%% Cell type:code id: tags:
``` python
%%time
root_vec(a, b, c)
```
%% Output
CPU times: user 1.85 s, sys: 71.9 ms, total: 1.92 s
Wall time: 1.92 s
(array([-0.55797188, nan, 0.76166857, ..., -0.56859783,
nan, -0.53531175]),
array([ 1.11121928, nan, -1.70637551, ..., 0.98659831,
nan, 0.18339357]))
%% Cell type:code id: tags:
``` python
%%time
root_vec(a, b, 7)
```
%% Output
CPU times: user 1.66 s, sys: 62.7 ms, total: 1.73 s
Wall time: 1.73 s
(array([-1.72246968, -2.63787563, nan, ..., -2.20722406,
nan, -1.84415698]),
array([2.27571708, 2.04353033, nan, ..., 2.62522454, nan,
1.4922388 ]))
%% Cell type:markdown id: tags:
# Numba
%% Cell type:code id: tags:
``` python
import numba
```
%% Cell type:code id: tags:
``` python
root_jit = numba.jit(root)
```
%% Cell type:code id: tags:
``` python
%timeit root(23, 78, 19.0)
```
%% Output
3.08 µs ± 192 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%% Cell type:code id: tags:
``` python
%timeit root_jit(23, 78, 19.0)
```
%% Output
The slowest run took 11.09 times longer than the fastest. This could mean that an intermediate result is being cached.
786 ns ± 1.08 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
%% Cell type:code id: tags:
``` python
%%time
root_jit(a, b, 7)
```
%% Output
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
The above exception was the direct cause of the following exception:
SystemError Traceback (most recent call last)
<timed eval> in <module>()
SystemError: CPUDispatcher(<function root at 0x114035620>) returned a result with an error set
%% Cell type:code id: tags:
``` python
root_jit_vec = np.vectorize(root_jit)
%time root_jit_vec(a, b, 7)
```
%% Output
CPU times: user 406 ms, sys: 57.2 ms, total: 464 ms
Wall time: 464 ms
(array([-1.72246968, -2.63787563, nan, ..., -2.20722406,
nan, -1.84415698]),
array([2.27571708, 2.04353033, nan, ..., 2.62522454, nan,
1.4922388 ]))
%% Cell type:code id: tags:
``` python
@np.vectorize
@numba.jit
def fast_root(a, b, c):
D = b ** 2 - 4 * a * c
if D < 0:
return np.nan, np.nan
x1 = (-b + np.sqrt(D)) / (2 * a)
x2 = (-b - np.sqrt(D)) / (2 * a)
return x1, x2
%time fast_root(a, b, 7)
```
%% Output
CPU times: user 391 ms, sys: 58.2 ms, total: 449 ms
Wall time: 448 ms
(array([-1.72246968, -2.63787563, nan, ..., -2.20722406,
nan, -1.84415698]),
array([2.27571708, 2.04353033, nan, ..., 2.62522454, nan,
1.4922388 ]))
%% Cell type:code id: tags:
``` python
@numba.vectorize
def fast_root(a, b, c):
D = b ** 2 - 4 * a * c
if D < 0:
return np.nan, np.nan
x1 = (-b + np.sqrt(D)) / (2 * a)
x2 = (-b - np.sqrt(D)) / (2 * a)
return x1, x2
%time fast_root(a, b, 7)
```
%% Output
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
<timed eval> in <module>()
~/miniconda3/lib/python3.6/site-packages/numba/npyufunc/dufunc.py in _compile_for_args(self, *args, **kws)
166 argty = argty.dtype
167 argtys.append(argty)
--> 168 return self._compile_for_argtys(tuple(argtys))
169
170 def _compile_for_argtys(self, argtys, return_type=None):
~/miniconda3/lib/python3.6/site-packages/numba/npyufunc/dufunc.py in _compile_for_argtys(self, argtys, return_type)
188 cres, argtys, return_type)
189 dtypenums, ptr, env = ufuncbuilder._build_element_wise_ufunc_wrapper(
--> 190 cres, actual_sig)
191 self._add_loop(utils.longint(ptr), dtypenums)
192 self._keepalive.append((ptr, cres.library, env))
~/miniconda3/lib/python3.6/site-packages/numba/npyufunc/ufuncbuilder.py in _build_element_wise_ufunc_wrapper(cres, signature)
163 # Get dtypes
164 dtypenums = [as_dtype(a).num for a in signature.args]
--> 165 dtypenums.append(as_dtype(signature.return_type).num)
166 return dtypenums, ptr, env
167
~/miniconda3/lib/python3.6/site-packages/numba/numpy_support.py in as_dtype(nbtype)
134 return as_dtype(nbtype.dtype)
135 raise NotImplementedError("%r cannot be represented as a Numpy dtype"
--> 136 % (nbtype,))
137
138
NotImplementedError: (float64 x 2) cannot be represented as a Numpy dtype
%% Cell type:markdown id: tags:
# But run the profiler first
%% Cell type:code id: tags:
``` python
def fib(n):
if n <= 1:
return 1
else:
return fib(n-2) + fib(n-1)
%time fib(33)
```
%% Output
CPU times: user 1.56 s, sys: 3.5 ms, total: 1.56 s
Wall time: 1.56 s
5702887
%% Cell type:code id: tags:
``` python
@numba.jit
def fib_jit(n):
if n <= 1:
return 1
else:
return fib_jit(n-2) + fib_jit(n-1)
%time fib_jit(33)
```
%% Output
CPU times: user 63.4 ms, sys: 1.79 ms, total: 65.2 ms
Wall time: 64.2 ms
5702887
%% Cell type:code id: tags:
``` python
%time fib_jit(41)
```
%% Output
CPU times: user 1.28 s, sys: 5.26 ms, total: 1.28 s
Wall time: 1.28 s
267914296
%% Cell type:code id: tags:
``` python
%prun fib(33)
%prun fib_jit(41)
```
%% Output
%% Cell type:code id: tags:
``` python
%prun fib_jit(41)
%prun fib(33)
```
%% Output
%% Cell type:code id: tags:
``` python
from functools import lru_cache
@lru_cache(None)
def fib_cache(n):
if n <= 1:
return 1
else:
return fib_cache(n-2) + fib_cache(n-1)
%time fib_cache(33)
```
%% Output
CPU times: user 20 µs, sys: 1 µs, total: 21 µs
Wall time: 24.1 µs
%% Cell type:code id: tags:
``` python
lru_cache?
```
%% Cell type:code id: tags:
``` python
%time fib_cache(500)
```
%% Output
CPU times: user 261 µs, sys: 1e+03 ns, total: 262 µs
Wall time: 267 µs
225591516161936330872512695036072072046011324913758190588638866418474627738686883405015987052796968498626
%% Cell type:markdown id: tags:
# Optimizing numpy algebra
%% Cell type:markdown id: tags:
Multiplying/adding arrays incurs a lot of overhead, because lots of intermediate arrays get created and discarded again
%% Cell type:code id: tags:
``` python
a = np.random.randn(int(1e7))
b = np.random.randn(int(1e7))
%timeit a * b + 4.1 * a < 3 * b
```
%% Output
139 ms ± 1.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%% Cell type:code id: tags:
``` python
import numexpr as ne
%timeit ne.evaluate('a * b + 4.1 * a < 3 * b')
```
%% Output
11.4 ms ± 484 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%% Cell type:markdown id: tags:
Efficiency gain reduces once the operations become more complex
%% Cell type:code id: tags:
``` python
%timeit a * np.sqrt(abs(b)) < 3 * b
c = np.random.randn(int(1e7))
%timeit (-b - np.sqrt(b ** 2 - 4 * a * c) ) / (2 * a)
```
%% Output
141 ms ± 1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%% Cell type:code id: tags:
``` python
%timeit ne.evaluate('a * sqrt(abs(b)) < 3 * b')
%timeit ne.evaluate('(-b - sqrt(b ** 2 - 4 * a * c) ) / (2 * a)')
```
%% Output
12.6 ms ± 986 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%% Cell type:markdown id: tags:
# Cython
%% Cell type:markdown id: tags:
http://docs.cython.org/en/latest/
Compiles most python programs to C code for extra speed (but less introspection). Also allows direct calling of C library code.
Can include python classes and much more.
%% Cell type:code id: tags:
``` python
%load_ext cython
```
%% Output
The cython extension is already loaded. To reload it, use:
%reload_ext cython
%% Cell type:code id: tags:
``` python
def primes(kmax):
p = {}
result = []
if kmax > 1000:
kmax = 1000
k = 0
n = 2
while k < kmax:
i = 0
while i < k and n % p[i] != 0:
i = i + 1
if i == k:
p[k] = n
k = k + 1
result.append(n)
n = n + 1
return result
print(primes(10))
%time _ = primes(10000)
```
%% Output
[2, 3, 5, 7, 11, 13, 17, 19, 23, 29]
CPU times: user 67.4 ms, sys: 1.22 ms, total: 68.6 ms
Wall time: 68 ms
%% Cell type:code id: tags:
``` python
%%cython -a
def cprimes(int kmax):
cdef int[1000] p
def cprimes(kmax):
p = {}
result = []
if kmax > 1000:
kmax = 1000
cdef int k, n, i
k = 0
n = 2
while k < kmax:
i = 0
while i < k and n % p[i] != 0:
i = i + 1
if i == k:
p[k] = n
k = k + 1
result.append(n)
n = n + 1
return result
```
%% Output
<IPython.core.display.HTML object>
%% Cell type:code id: tags:
``` python
print(cprimes(10))
%time _ = cprimes(10000)
```
%% Output
[2, 3, 5, 7, 11, 13, 17, 19, 23, 29]
CPU times: user 2.2 ms, sys: 10 µs, total: 2.21 ms
Wall time: 2.22 ms
%% Cell type:markdown id: tags:
## Defining C-style function
%% Cell type:code id: tags:
``` python
#%%cython -a
def fib(n):
%%cython -a
def cfib(n):
if n <= 1:
return 1
else:
return fib(n-2) + fib(n-1)
return cfib(n-2) + cfib(n-1)
```
%% Cell type:code id: tags:
``` python
%time fib(33)
%time cfib(33)
```
%% Output
CPU times: user 1.48 s, sys: 1.79 ms, total: 1.48 s
Wall time: 1.48 s
5702887
%% Cell type:code id: tags:
``` python
%time fib_jit(33)
```
%% Output
CPU times: user 27.5 ms, sys: 73 µs, total: 27.5 ms
Wall time: 27.5 ms
5702887
%% Cell type:code id: tags:
``` python
%%cython -a
cdef int cfib(int n) nogil:
if n <= 1:
return 1
else:
return cfib(n-2) + cfib(n-1)
def fib(n):
return cfib(n)
```
%% Output
<IPython.core.display.HTML object>
%% Cell type:code id: tags:
``` python
%time fib(33)
```
%% Output
CPU times: user 10.2 ms, sys: 138 µs, total: 10.4 ms
Wall time: 10.4 ms
5702887
%% Cell type:markdown id: tags:
## Cython in production code
%% Cell type:code id: tags:
``` python
def root(a, b, c):
D = b ** 2 - 4 * a * c
if D < 0:
return NAN, NAN
x1 = (-b + sqrt(D)) / (2 * a)
x2 = (-b - sqrt(D)) / (2 * a)
return np.nan, np.nan
x1 = (-b + np.sqrt(D)) / (2 * a)
x2 = (-b - np.sqrt(D)) / (2 * a)
return x1, x2
root(1, 3, 4), root(-1, 3, 4), root(1, 0, 0)
```
%% Cell type:code id: tags:
``` python
a = np.random.randn(int(1e6))
b = np.random.randn(int(1e6))
c = np.random.randn(int(1e6))
%time np.vectorize(root)(a, b, 7)
```
%% Output
CPU times: user 236 ms, sys: 53.7 ms, total: 290 ms
Wall time: 288 ms
(array([-1.72246969, -2.6378758 , nan, ..., -2.20722389,
nan, -1.84415698]),
array([2.27571702, 2.04353046, nan, ..., 2.62522459, nan,
1.49223876]))
%% Cell type:code id: tags:
``` python
%time root_jit_vec(a, b, 7)
```
%% Output
CPU times: user 353 ms, sys: 57.8 ms, total: 411 ms
Wall time: 410 ms
(array([-1.72246968, -2.63787563, nan, ..., -2.20722406,
nan, -1.84415698]),
array([2.27571708, 2.04353033, nan, ..., 2.62522454, nan,
1.4922388 ]))
%% Cell type:code id: tags:
``` python
%%cython -a
from libc.math cimport sqrt, NAN
def root(float a, float b, float c):
cdef float D, x1, x2
D = b ** 2 - 4 * a * c
if D < 0:
return NAN, NAN
x1 = (-b + sqrt(D)) / (2 * a)
x2 = (-b - sqrt(D)) / (2 * a)
return x1, x2
print(root(1, 3, 4), root(-1, 3, 4), root(1, 0, 0))
```
%% Cell type:markdown id: tags:
## Cython in production code
## More cython
%% Cell type:markdown id: tags:
http://docs.cython.org/en/latest/src/tutorial/cython_tutorial.html
%% Cell type:markdown id: tags:
# Steps to optimizing code:
- Profile where the bottleneck is (%prun)
- You can install a [line profiler](https://github.com/rkern/line_profiler) (available as %lprun after installation)
- Is there a faster algorithm for the bottleneck?
- If the bottleneck is vectorized: can we optimize with numexpr?
- If the internal part of the loop can not be vectorized:
- numba jit if simple enough otherwise cython
- the loop itself can be sped up with numpy.vectorize
- If the bottleneck gets called too often: cache the result
- If even that fails: pycuda
- If that is too slow: learn to optimize cuda code or find an easier problem
%% Cell type:code id: tags:
``` python
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment