diff --git a/advanced_programming/threading.ipynb b/advanced_programming/threading.ipynb index 69ec51b3309fd23189f21b829d7ce4ed5c5879f0..cb2f07b7adcaaf59bc736f532ca5e824f456e490 100644 --- a/advanced_programming/threading.ipynb +++ b/advanced_programming/threading.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "215bdbaf", "metadata": {}, "source": [ "# Threading and parallel processing\n", @@ -15,6 +16,11 @@ "[`multiprocessing`](todo).\n", "\n", "\n", + "> *Note*: This notebook covers features that are built-in to the Python\n", + "> programming language. However, there are many other parallelisation options\n", + "> available to you through third-party libraries - some of them are covered in `applications/parallel/parallel.ipynb`.\n", + "\n", + "\n", "> *Note*: If you are familiar with a \"real\" programming language such as C++\n", "> or Java, you might be disappointed with the native support for parallelism in\n", "> Python. Python threads do not run in parallel because of the Global\n", @@ -61,6 +67,7 @@ { "cell_type": "code", "execution_count": null, + "id": "53f13f61", "metadata": {}, "outputs": [], "source": [ @@ -85,6 +92,7 @@ }, { "cell_type": "markdown", + "id": "859f5455", "metadata": {}, "source": [ "You can also `join` a thread, which will block execution in the current thread\n", @@ -94,6 +102,7 @@ { "cell_type": "code", "execution_count": null, + "id": "b039f5db", "metadata": {}, "outputs": [], "source": [ @@ -107,6 +116,7 @@ }, { "cell_type": "markdown", + "id": "2da49354", "metadata": {}, "source": [ "<a class=\"anchor\" id=\"subclassing-thread\"></a>\n", @@ -120,6 +130,7 @@ { "cell_type": "code", "execution_count": null, + "id": "7d248656", "metadata": {}, "outputs": [], "source": [ @@ -142,6 +153,7 @@ }, { "cell_type": "markdown", + "id": "1d90d56c", "metadata": {}, "source": [ "<a class=\"anchor\" id=\"daemon-threads\"></a>\n", @@ -162,6 +174,7 @@ { "cell_type": "code", "execution_count": null, + "id": "ceafbaac", "metadata": {}, "outputs": [], "source": [ @@ -171,6 +184,7 @@ }, { "cell_type": "markdown", + "id": "df69b8e4", "metadata": {}, "source": [ "See the [`Thread`\n", @@ -205,6 +219,7 @@ { "cell_type": "code", "execution_count": null, + "id": "25334a03", "metadata": {}, "outputs": [], "source": [ @@ -221,6 +236,7 @@ }, { "cell_type": "markdown", + "id": "6039c7a6", "metadata": {}, "source": [ "But if we protect the critical section with a `Lock` object, the output will\n", @@ -230,6 +246,7 @@ { "cell_type": "code", "execution_count": null, + "id": "f8b9b6ad", "metadata": {}, "outputs": [], "source": [ @@ -250,6 +267,7 @@ }, { "cell_type": "markdown", + "id": "52761f94", "metadata": {}, "source": [ "> Instead of using a `Lock` object in a `with` statement, it is also possible\n", @@ -273,6 +291,7 @@ { "cell_type": "code", "execution_count": null, + "id": "ede5bfb6", "metadata": {}, "outputs": [], "source": [ @@ -312,6 +331,7 @@ }, { "cell_type": "markdown", + "id": "dc3de638", "metadata": {}, "source": [ "Try removing the `mutex` lock from the two methods in the above code, and see\n", @@ -335,6 +355,7 @@ { "cell_type": "code", "execution_count": null, + "id": "4441bd44", "metadata": {}, "outputs": [], "source": [ @@ -359,6 +380,7 @@ }, { "cell_type": "markdown", + "id": "865b59c8", "metadata": {}, "source": [ "<a class=\"anchor\" id=\"the-global-interpreter-lock-gil\"></a>\n", @@ -479,6 +501,7 @@ { "cell_type": "code", "execution_count": null, + "id": "86a769fe", "metadata": {}, "outputs": [], "source": [ @@ -486,36 +509,21 @@ "import multiprocessing as mp\n", "import numpy as np\n", "\n", - "# We must avoid concurrent accesses to the\n", - "# print function when running parallel code\n", - "# within the Jupyter notebook environment.\n", - "# This would not be necessary when executing\n", - "# code normally (i.e. outside of Jupyter\n", - "# notebook)\n", - "lock = mp.Lock()\n", - "\n", "def crunchImage(imgfile):\n", "\n", - " # Load a nifti image, do stuff\n", - " # to it. Use your imagination\n", - " # to fill in this function.\n", + " # Load a nifti image and calculate some\n", + " # metric from the image. Use your\n", + " # imagination to fill in this function.\n", " time.sleep(2)\n", - "\n", - " # numpy's random number generator\n", - " # will be initialised in the same\n", - " # way in each process, so let's\n", - " # re-seed it.\n", " np.random.seed()\n", - " result = np.random.randint(1, 100, 1)\n", - "\n", - " with lock:\n", - " print(imgfile, ':', result)\n", + " result = np.random.randint(1, 100, 1)[0]\n", "\n", " return result\n", "\n", + "\n", "imgfiles = [f'{i:02d}.nii.gz' for i in range(20)]\n", "\n", - "print('Crunching images...')\n", + "print(f'Crunching {len(imgfiles)} images...')\n", "\n", "start = time.time()\n", "\n", @@ -524,11 +532,15 @@ "\n", "end = time.time()\n", "\n", + "for imgfile, result in zip(imgfiles, results):\n", + " print(f'Result for {imgfile}: {result}')\n", + "\n", "print('Total execution time: {:0.2f} seconds'.format(end - start))" ] }, { "cell_type": "markdown", + "id": "68cfea5c", "metadata": {}, "source": [ "The `Pool.map` method only works with functions that accept one argument, such\n", @@ -541,10 +553,10 @@ { "cell_type": "code", "execution_count": null, + "id": "9f249cde", "metadata": {}, "outputs": [], "source": [ - "lock = mp.Lock()\n", "def crunchImage(imgfile, modality):\n", " time.sleep(2)\n", "\n", @@ -555,10 +567,8 @@ " elif modality == 't2':\n", " result = np.random.randint(100, 200, 1)\n", "\n", - " with lock:\n", - " print(imgfile, ': ', result)\n", + " return result[0]\n", "\n", - " return result\n", "\n", "imgfiles = [f't1_{i:02d}.nii.gz' for i in range(10)] + \\\n", " [f't2_{i:02d}.nii.gz' for i in range(10)]\n", @@ -575,11 +585,15 @@ "\n", "end = time.time()\n", "\n", + "for imgfile, modality, result in zip(imgfiles, modalities, results):\n", + " print(f'{imgfile} [{modality}]: {result}')\n", + "\n", "print('Total execution time: {:0.2f} seconds'.format(end - start))" ] }, { "cell_type": "markdown", + "id": "b7cf7cb6", "metadata": {}, "source": [ "The `map` and `starmap` methods also have asynchronous equivalents `map_async`\n", @@ -610,6 +624,7 @@ { "cell_type": "code", "execution_count": null, + "id": "4dcee67f", "metadata": {}, "outputs": [], "source": [ @@ -620,9 +635,9 @@ "\n", "def linear_registration(src, ref):\n", " time.sleep(1)\n", - "\n", " return np.eye(4)\n", "\n", + "\n", "def nonlinear_registration(src, ref, affine):\n", "\n", " time.sleep(3)\n", @@ -630,13 +645,15 @@ " # this number represents a non-linear warp\n", " # field - use your imagination people!\n", " np.random.seed()\n", - " return np.random.randint(1, 100, 1)\n", + " return np.random.randint(1, 100, 1)[0]\n", + "\n", "\n", "t1s = [f'{i:02d}_t1.nii.gz' for i in range(20)]\n", "std = 'MNI152_T1_2mm.nii.gz'\n", "\n", "print('Running structural-to-standard registration '\n", - " 'on {} subjects...'.format(len(t1s)))\n", + " f'on {len(t1s)} subjects...')\n", + "\n", "\n", "# Run linear registration on all the T1s.\n", "start = time.time()\n", @@ -653,10 +670,12 @@ " for i, r in enumerate(linresults):\n", " linresults[i] = r.get()\n", "\n", + "\n", "end = time.time()\n", "\n", "print('Linear registrations completed in '\n", - " '{:0.2f} seconds'.format(end - start))\n", + " f'{end - start:0.2f} seconds')\n", + "\n", "\n", "# Run non-linear registration on all the T1s,\n", "# using the linear registrations to initialise.\n", @@ -670,6 +689,7 @@ " for i, r in enumerate(nlinresults):\n", " nlinresults[i] = r.get()\n", "\n", + "\n", "end = time.time()\n", "\n", "print('Non-linear registrations completed in '\n", @@ -677,11 +697,12 @@ "\n", "print('Non linear registrations:')\n", "for t1, result in zip(t1s, nlinresults):\n", - " print(t1, ':', result)" + " print(f'{t1} : {result}')" ] }, { "cell_type": "markdown", + "id": "0ab5d9e2", "metadata": {}, "source": [ "<a class=\"anchor\" id=\"sharing-data-between-processes\"></a>\n", @@ -707,7 +728,9 @@ "\n", "> <sup>1</sup>*Pickleable* is the term used in the Python world to refer to\n", "> something that is *serialisable* - basically, the process of converting an\n", - "> in-memory object into a binary form that can be stored and/or transmitted.\n", + "> in-memory object into a binary form that can be stored and/or transmitted,\n", + "> and then loaded back into memory at some point in the future (in the same\n", + "> process, or in another process).\n", "\n", "\n", "There is obviously some overhead in copying data back and forth between the\n", @@ -728,7 +751,7 @@ "\n", "\n", "This is because, when you create a `Pool`, what actually happens is that the\n", - "process your Pythonn script is running in will [**fork**][wiki-fork] itself -\n", + "process your Python script is running in will [**fork**][wiki-fork] itself -\n", "the child processes that are created are used as the worker processes by the\n", "`Pool`. And if you create/load your data in your main process *before* this\n", "fork occurs, all of the child processes will inherit the memory space of the\n", @@ -751,6 +774,7 @@ { "cell_type": "code", "execution_count": null, + "id": "af8db9e4", "metadata": {}, "outputs": [], "source": [ @@ -776,6 +800,7 @@ }, { "cell_type": "markdown", + "id": "7ec2c3c7", "metadata": {}, "source": [ "Now our task is simply to calculate the sum of a large array of numbers. We're\n", @@ -786,6 +811,7 @@ { "cell_type": "code", "execution_count": null, + "id": "0e872d7f", "metadata": {}, "outputs": [], "source": [ @@ -835,6 +861,7 @@ }, { "cell_type": "markdown", + "id": "026e1cd3", "metadata": {}, "source": [ "You should be able to see that only one copy of `data` is created, and is\n", @@ -922,6 +949,7 @@ { "cell_type": "code", "execution_count": null, + "id": "a04bc5de", "metadata": {}, "outputs": [], "source": [ @@ -949,6 +977,7 @@ }, { "cell_type": "markdown", + "id": "518073ed", "metadata": {}, "source": [ "Rather than passing the input and output data arrays in as arguments to the\n", @@ -974,6 +1003,7 @@ { "cell_type": "code", "execution_count": null, + "id": "349cb770", "metadata": {}, "outputs": [], "source": [ @@ -1044,6 +1074,7 @@ }, { "cell_type": "markdown", + "id": "0abbf164", "metadata": {}, "source": [ "Now we can call our `process_data` function just like any other function:" @@ -1052,6 +1083,7 @@ { "cell_type": "code", "execution_count": null, + "id": "ccc4ea77", "metadata": {}, "outputs": [], "source": [ @@ -1068,5 +1100,5 @@ ], "metadata": {}, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/advanced_programming/threading.md b/advanced_programming/threading.md index a6e045315658ab6b2b3ff893bcf36a628a6f207c..63cccd3eebc1ea30ea1ae02b1769074c607215bb 100644 --- a/advanced_programming/threading.md +++ b/advanced_programming/threading.md @@ -9,6 +9,11 @@ module. If you want to be impressed, skip straight to the section on [`multiprocessing`](todo). +> *Note*: This notebook covers features that are built-in to the Python +> programming language. However, there are many other parallelisation options +> available to you through third-party libraries - some of them are covered in `applications/parallel/parallel.ipynb`. + + > *Note*: If you are familiar with a "real" programming language such as C++ > or Java, you might be disappointed with the native support for parallelism in > Python. Python threads do not run in parallel because of the Global @@ -411,36 +416,21 @@ import time import multiprocessing as mp import numpy as np -# We must avoid concurrent accesses to the -# print function when running parallel code -# within the Jupyter notebook environment. -# This would not be necessary when executing -# code normally (i.e. outside of Jupyter -# notebook) -lock = mp.Lock() - def crunchImage(imgfile): - # Load a nifti image, do stuff - # to it. Use your imagination - # to fill in this function. + # Load a nifti image and calculate some + # metric from the image. Use your + # imagination to fill in this function. time.sleep(2) - - # numpy's random number generator - # will be initialised in the same - # way in each process, so let's - # re-seed it. np.random.seed() - result = np.random.randint(1, 100, 1) - - with lock: - print(imgfile, ':', result) + result = np.random.randint(1, 100, 1)[0] return result + imgfiles = [f'{i:02d}.nii.gz' for i in range(20)] -print('Crunching images...') +print(f'Crunching {len(imgfiles)} images...') start = time.time() @@ -449,6 +439,9 @@ with mp.Pool(processes=16) as p: end = time.time() +for imgfile, result in zip(imgfiles, results): + print(f'Result for {imgfile}: {result}') + print('Total execution time: {:0.2f} seconds'.format(end - start)) ``` @@ -461,7 +454,6 @@ method instead: ``` -lock = mp.Lock() def crunchImage(imgfile, modality): time.sleep(2) @@ -472,10 +464,8 @@ def crunchImage(imgfile, modality): elif modality == 't2': result = np.random.randint(100, 200, 1) - with lock: - print(imgfile, ': ', result) + return result[0] - return result imgfiles = [f't1_{i:02d}.nii.gz' for i in range(10)] + \ [f't2_{i:02d}.nii.gz' for i in range(10)] @@ -492,6 +482,9 @@ with mp.Pool(processes=16) as pool: end = time.time() +for imgfile, modality, result in zip(imgfiles, modalities, results): + print(f'{imgfile} [{modality}]: {result}') + print('Total execution time: {:0.2f} seconds'.format(end - start)) ``` @@ -529,9 +522,9 @@ import numpy as np def linear_registration(src, ref): time.sleep(1) - return np.eye(4) + def nonlinear_registration(src, ref, affine): time.sleep(3) @@ -539,13 +532,15 @@ def nonlinear_registration(src, ref, affine): # this number represents a non-linear warp # field - use your imagination people! np.random.seed() - return np.random.randint(1, 100, 1) + return np.random.randint(1, 100, 1)[0] + t1s = [f'{i:02d}_t1.nii.gz' for i in range(20)] std = 'MNI152_T1_2mm.nii.gz' print('Running structural-to-standard registration ' - 'on {} subjects...'.format(len(t1s))) + f'on {len(t1s)} subjects...') + # Run linear registration on all the T1s. start = time.time() @@ -562,10 +557,12 @@ with mp.Pool(processes=16) as pool: for i, r in enumerate(linresults): linresults[i] = r.get() + end = time.time() print('Linear registrations completed in ' - '{:0.2f} seconds'.format(end - start)) + f'{end - start:0.2f} seconds') + # Run non-linear registration on all the T1s, # using the linear registrations to initialise. @@ -579,6 +576,7 @@ with mp.Pool(processes=16) as pool: for i, r in enumerate(nlinresults): nlinresults[i] = r.get() + end = time.time() print('Non-linear registrations completed in ' @@ -586,7 +584,7 @@ print('Non-linear registrations completed in ' print('Non linear registrations:') for t1, result in zip(t1s, nlinresults): - print(t1, ':', result) + print(f'{t1} : {result}') ``` @@ -613,7 +611,9 @@ third-party library). > <sup>1</sup>*Pickleable* is the term used in the Python world to refer to > something that is *serialisable* - basically, the process of converting an -> in-memory object into a binary form that can be stored and/or transmitted. +> in-memory object into a binary form that can be stored and/or transmitted, +> and then loaded back into memory at some point in the future (in the same +> process, or in another process). There is obviously some overhead in copying data back and forth between the @@ -634,7 +634,7 @@ the processes, you will need to: This is because, when you create a `Pool`, what actually happens is that the -process your Pythonn script is running in will [**fork**][wiki-fork] itself - +process your Python script is running in will [**fork**][wiki-fork] itself - the child processes that are created are used as the worker processes by the `Pool`. And if you create/load your data in your main process *before* this fork occurs, all of the child processes will inherit the memory space of the