diff --git a/pyfeeds/common.py b/pyfeeds/common.py
index 9711f5e3b15b98a2aa673a94e443ccea3aab17c6..33a49f7d807a8d7954cd783993e00d8b9e08f440 100644
--- a/pyfeeds/common.py
+++ b/pyfeeds/common.py
@@ -23,6 +23,7 @@ pyfeeds, and made available to test scripts.
 
 
 import            collections
+import            contextlib
 import            datetime
 import            tempfile
 import            logging
@@ -36,6 +37,23 @@ import numpy   as np
 log = logging.getLogger(__name__)
 
 
+@contextlib.contextmanager
+def tempdir():
+    """Context manager which creates and changes into a temporary directory,
+    On exit, the directory is deleted and the current directory restored.
+    """
+
+    prevdir = os.getcwd()
+
+    with tempfile.TemporaryDirectory(delete=True) as td:
+        try:
+            os.chdir(testdir)
+            yield td.name
+
+        finally:
+            os.chdir(prevdir)
+
+
 def printColumns(titles, columns):
     """Convenience function which pretty-prints a collection of columns in a
     tabular format.
diff --git a/pyfeeds/evaluate.py b/pyfeeds/evaluate.py
index 56dec2f1a86bcee73c71adef78e7a4dd2813d6dc..3f9bc3c19837ebfc79c6586d9b9d998020cc1578 100644
--- a/pyfeeds/evaluate.py
+++ b/pyfeeds/evaluate.py
@@ -13,8 +13,15 @@ identified by their name - any function with a name that begins in ``eval``,
 but not ``evaluate``, is considered to be an evaluation routine.
 
 
-All evaluation routines must accept two parameters, which are the names of two
-files to be compared.
+All evaluation routines must accept two positional arguments:
+ - Path to the file to test
+ - Path to the benchmark file to compare against
+
+And at least one keyword argument called ``pyf``, which may be ``None``. When
+an evaluation routine is called by ``pyfeeds``, this will be a
+:class:`.Pyfeeds` context object which provides access to the
+:class:`.ImageCache`. But when called by Python-based ``feedsRun`` scripts
+which evaluate their own results, this can be set to ``None``.
 
 
 Evaluation routines ending with the word ``Group`` are routines which accept
@@ -23,6 +30,7 @@ two groups of files, and evaluate the groups as a whole.
 
 All evaluation routines must return a numeric value, where 0 indicates that
 the files are identical, and non-0 indicates that the files are different.
+
 """
 
 
@@ -148,7 +156,7 @@ def getEvaluationRoutines(pyf, filename):
     for pattern, rnames in pyf.evalRoutines.items():
         if fnmatch.fnmatch(filename, pattern):
             rfuncs           = [getattr(thismod, n) for n in rnames]
-            rfuncs           = [ft.update_wrapper(ft.partial(f, pyf), f)
+            rfuncs           = [ft.update_wrapper(ft.partial(f, pyf=pyf), f)
                                 for f in rfuncs]
             routines         = list(zip(rnames, rfuncs))
             matches[pattern] = routines
@@ -391,15 +399,36 @@ def compareAllFiles(pyf, test, outputDir, benchmarkDir, logFile=None):
     return finalResult
 
 
-def evalHeader(pyf, testfile, benchmark, alldims=True):
+def loadImage(pyf, filename):
+    """Used by the ``eval*`` functions. If running within ``pyfeeds`` and the
+    ``Pyfeeds`` context object ``pyf`` is not None, loads ``filename`` via the
+    :class:`.ImageCache`. Otherwise, if ``pyf is None``, loads ``filename``
+    using ``nibabel``.
+
+    Returns a tuple containing the ``nibabel`` image, and a ``numpy``array
+    with the image data. If ``filename`` is not a NIfTI image, the second
+    returned value will be ``None``.
+    """
+    if pyf is not None:
+        return pyf.imageCache[filename]
+    else:
+        img = nib.load(filename)
+        if isinstance(image, nib.Nifti1Image):
+            data = np.asanyarray(image.dataobj)
+        else:
+            data = None
+        return img, data
+
+
+def evalHeader(testfile, benchmark, alldims=True, pyf=None):
     """Evaluation routine which compares the header fields of two NIFTI
     images.
 
     Returns 0 if they all match, 1 otherwise.
     """
 
-    img1   = pyf.imageCache[testfile][0]
-    img2   = pyf.imageCache[benchmark][0]
+    img1   = loadImage(testfile)[0]
+    img2   = loadImage(benchmark)[0]
     hdr1   = img1.header
     hdr2   = img2.header
     fields = ['dim',       'pixdim',     'intent_code',
@@ -423,7 +452,7 @@ def evalHeader(pyf, testfile, benchmark, alldims=True):
     return 0
 
 
-def evalHeaderRestrictDims(pyf, testfile, benchmark):
+def evalHeaderRestrictDims(testfile, benchmark, pyf=None):
     """Evaluation routine which compares the header fields of two NIFTI
     images. For the `dim` and `pixdim` fields, only the entries which
     are expected to be valid (e.g. `dim1`, `dim2`, and `dim3` for a 3D image)
@@ -431,33 +460,33 @@ def evalHeaderRestrictDims(pyf, testfile, benchmark):
 
     Returns 0 if they all match, 1 otherwise.
     """
-    return evalHeader(pyf, testfile, benchmark, alldims=False)
+    return evalHeader(testfile, benchmark, alldims=False, pyf=pyf)
 
 
-def evalImage(pyf, testfile, benchmark):
+def evalImage(testfile, benchmark, pyf=None):
     """Evaluation routine which compares the data from two NIFTI images.
     Returns the mean difference between the two images, normalised by
     the combined data range.
 
     The :func:`cmpArrays` function does the calculation.
     """
-    data1  = pyf.imageCache[testfile][1]
-    data2  = pyf.imageCache[benchmark][1]
+    data1 = loadImage(pyf, testfile)[1]
+    data2 = loadImage(pyf, benchmark)[1]
     return cmpArrays(data1, data2, metric='mean')
 
 
-def evalImageMaxDiff(pyf, testfile, benchmark):
+def evalImageMaxDiff(testfile, benchmark, pyf=None):
     """Evaluation routine which compares the data from two NIFTI images.
     Returns the maximum absolute difference between the two images.
 
     The :func:`cmpArrays` function does the calculation.
     """
-    data1  = pyf.imageCache[testfile][1]
-    data2  = pyf.imageCache[benchmark][1]
+    data1 = loadImage(pyf, testfile)[1]
+    data2 = loadImage(pyf, benchmark)[1]
     return cmpArrays(data1, data2, metric='max')
 
 
-def evalNumericalText(pyf, testfile, benchmark):
+def evalNumericalText(testfile, benchmark, pyf=None):
     """Evaluation routine which compares the numerical data from two text
     files.
 
@@ -470,7 +499,7 @@ def evalNumericalText(pyf, testfile, benchmark):
     return cmpArrays(data1, data2)
 
 
-def evalMD5(pyf, testfile, benchmark):
+def evalMD5(testfile, benchmark, pyf=None):
     """Compares the given files by calculating their MD5 hash. Returns
     0 if the hashes match, 1 otherwise.
     """
@@ -482,19 +511,19 @@ def evalMD5(pyf, testfile, benchmark):
     else:              return 0
 
 
-def evalVectorImage(pyf, testfile, benchmark):
+def evalVectorImage(testfile, benchmark, pyf=None):
     """Compares the given NIFTI images. It is assumed that they are
     both vector images, where each voxel contains a 3-dimensional
     vector, undirected, and centered at 0.
     """
 
-    data1 = pyf.imageCache[testfile][ 1]
-    data2 = pyf.imageCache[benchmark][1]
+    data1 = loadImage(pyf, testfile)[ 1]
+    data2 = loadImage(pyf, benchmark)[1]
 
     return cmpVectorArrays(data1, data2)
 
 
-def evalPolarCoordinateImageGroup(pyf, testfiles, benchmarks):
+def evalPolarCoordinateImageGroup(testfiles, benchmarks, pyf=None):
     """Compares the values in a set of files containing 3D polar coordinates.
 
     Currently only the ``phi`` and ``theta`` outputs of ``bedpostx`` are
@@ -531,10 +560,10 @@ def evalPolarCoordinateImageGroup(pyf, testfiles, benchmarks):
         raise ValueError('Wrong number of files: {} <-> {}'
                          ''.format(testfiles, benchmarks))
 
-    testtheta  = pyf.imageCache[testtheta[ 0]][1]
-    testphi    = pyf.imageCache[testphi[   0]][1]
-    benchtheta = pyf.imageCache[benchtheta[0]][1]
-    benchphi   = pyf.imageCache[benchphi[  0]][1]
+    testtheta  = loadImage(pyf, testtheta[ 0])[1]
+    testphi    = loadImage(pyf, testphi[   0])[1]
+    benchtheta = loadImage(pyf, benchtheta[0])[1]
+    benchphi   = loadImage(pyf, benchphi[  0])[1]
 
     if any((testphi   .shape != testtheta.shape,
             benchtheta.shape != testtheta.shape,
@@ -554,21 +583,19 @@ def evalPolarCoordinateImageGroup(pyf, testfiles, benchmarks):
     testvec  = nib.Nifti1Image(testvec,  np.eye(4), None)
     benchvec = nib.Nifti1Image(benchvec, np.eye(4), None)
 
-    name = 'evalPolarCoordinateImageGroup_' + '_'.join(testfiles)
-
-    pyf.imageCache[name + '_test']      = testvec
-    pyf.imageCache[name + '_benchmark'] = benchvec
-
-    return evalVectorImage(pyf, name + '_test', name + '_benchmark')
+    with common.tempdir():
+        nib.save(testvec,  'testvec.nii.gz')
+        nib.save(benchvec, 'benchvec.nii.gz')
+        return evalVectorImage('testvec.nii.gz', 'benchvec.nii.gz')
 
 
-def evalGiftiVertices(pyf, testfile, benchmark):
+def evalGiftiVertices(testfile, benchmark, pyf=None):
     """Compare the vertices of two GIFTI surfaces. Uses the :func:`cmpArrays`
     function.
     """
 
-    surf1 = pyf.imageCache[testfile][0]
-    surf2 = pyf.imageCache[benchmark][0]
+    surf1 = loadImage(pyf, testfile)[0]
+    surf2 = loadImage(pyf, benchmark)[0]
 
     # NIFTI_INTENT_POINTSET == 1008
     verts1 = [d for d in surf1.darrays if d.intent == 1008][0].data
diff --git a/pyfeeds/imagecache.py b/pyfeeds/imagecache.py
index a9a9a30a40f074e001d1a120d2f6c37c4e017cc5..aefca31b1b4cf479d1170892676466dd2d89aae8 100644
--- a/pyfeeds/imagecache.py
+++ b/pyfeeds/imagecache.py
@@ -72,8 +72,9 @@ class ImageCache:
     def __getitem__(self, imagefile):
         """Return the ``nibabel`` image corresponding to the given
         ``imagefile``, loading it if necessary. A tuple is returned,
-        containing the ``nibabel`` image, and a ``numpy``array with
-        the image data.
+        containing the ``nibabel`` image, and a ``numpy``array with the
+        image data. If ``filename`` is not a NIfTI image, the second
+        returned value will be ``None``.
         """
 
         imagefile = op.realpath(op.abspath(imagefile))