# python-guide/mnist_loader.py

import gzip
import os
import struct
from array import array
import random

# Pixel post-processing modes accepted by the MNIST constructor.
#
#   vanilla            -- integer values in {0..255}, returned untouched.
#   randomly_binarized -- integer values in {0,1}; a pixel is set to 1
#                         (instead of 0) with probability p = orig/255, as in
#                         Ruslan Salakhutdinov and Iain Murray's paper
#                         'On the Quantitative Analysis of Deep Belief
#                         Networks' (2008).
#   rounded_binarized  -- integer values in {0,1}; a pixel is set to 1
#                         (instead of 0) if orig/255 > 0.5.
_allowed_modes = ('vanilla', 'randomly_binarized', 'rounded_binarized')

# Container types the loader can hand back.
#
#   lists -- the default. Computationally more expensive, but useful if
#            numpy is not installed.
#   numpy -- numpy arrays; the numpy module is loaded dynamically, on demand.
_allowed_return_types = ('lists', 'numpy')

np = None  # cache for the numpy module; filled in by _import_numpy()
def _import_numpy():
    '''
    Return the numpy module, importing it on first use.

    Meant to be called only when numpy output has been requested through
    the 'return_type' parameter of the MNIST constructor, so that numpy
    stays an optional dependency. The module is cached in the global `np`
    and imported only once.

    Raises MNISTException if numpy cannot be imported.
    '''
    global np
    if np is not None:
        # already imported by a previous call
        return np

    try:
        import numpy as _np
    except ImportError:
        raise MNISTException(
            "need to have numpy installed to return numpy arrays."
            " Otherwise, please set return_type='lists' in constructor."
        )

    np = _np
    return np

class MNISTException(Exception):
    '''Error raised by this module (numpy unavailable, unknown mode or return type).'''

class MNIST(object):
    '''
    Loader for MNIST / EMNIST data stored as IDX files.

    Every image is returned as a flat sequence of rows*cols pixel values.
    Pixels are optionally binarized according to `mode`, and results are
    returned as Python lists or numpy arrays according to `return_type`.
    '''

    def __init__(self, path='.', mode='vanilla', return_type='lists', gz=False):
        '''
        path        -- directory that contains the dataset files
        mode        -- pixel post-processing, one of _allowed_modes
        return_type -- 'lists' or 'numpy', one of _allowed_return_types
        gz          -- if True, files are opened through gzip and '.gz' is
                       appended to their names
        '''
        self.path = path

        # NOTE: asserts are skipped under `python -O`. The process_* methods
        # re-validate both values and raise MNISTException in that case.
        assert mode in _allowed_modes, \
            "selected mode '{}' not in {}".format(mode,_allowed_modes)

        self._mode = mode

        assert return_type in _allowed_return_types, \
            "selected return_type '{}' not in {}".format(
                return_type,
                _allowed_return_types
            )

        self._return_type = return_type

        self.test_img_fname = 't10k-images-idx3-ubyte'
        self.test_lbl_fname = 't10k-labels-idx1-ubyte'

        self.train_img_fname = 'train-images-idx3-ubyte'
        self.train_lbl_fname = 'train-labels-idx1-ubyte'

        self.gz = gz
        self.emnistRotate = False

        self.test_images = []
        self.test_labels = []

        self.train_images = []
        self.train_labels = []

    def select_emnist(self, dataset='digits'):
        '''
        Select one of the EMNIST datasets

        Switches the loader to gzip-compressed files and enables the EMNIST
        orientation fix-up applied in load().

        Available datasets:
            - balanced
            - byclass
            - bymerge
            - digits
            - letters
            - mnist
        '''
        template = 'emnist-{0}-{1}-{2}-idx{3}-ubyte'

        self.gz = True
        self.emnistRotate = True

        self.test_img_fname = template.format(dataset, 'test', 'images', 3)
        self.test_lbl_fname = template.format(dataset, 'test', 'labels', 1)

        self.train_img_fname = template.format(dataset, 'train', 'images', 3)
        self.train_lbl_fname = template.format(dataset, 'train', 'labels', 1)

    @property # read only because set only once, via constructor
    def mode(self):
        return self._mode

    @property # read only because set only once, via constructor
    def return_type(self):
        return self._return_type

    def load_testing(self):
        '''Load the whole test set; return (images, labels) and cache them.'''
        ims, labels = self.load(os.path.join(self.path, self.test_img_fname),
                                os.path.join(self.path, self.test_lbl_fname))

        self.test_images = self.process_images(ims)
        self.test_labels = self.process_labels(labels)

        return self.test_images, self.test_labels

    def load_training(self):
        '''Load the whole training set; return (images, labels) and cache them.'''
        ims, labels = self.load(os.path.join(self.path, self.train_img_fname),
                                os.path.join(self.path, self.train_lbl_fname))

        self.train_images = self.process_images(ims)
        self.train_labels = self.process_labels(labels)

        return self.train_images, self.train_labels

    def load_training_in_batches(self, batch_size):
        '''
        Generator over the training set in consecutive batches.

        Yields (images, labels) pairs holding `batch_size` samples each; the
        last batch holds the remainder when the dataset size is not a
        multiple of `batch_size`. No empty batch is ever yielded. The most
        recent batch is also stored in self.train_images / self.train_labels.

        Raises ValueError (on first iteration) if batch_size is not a
        positive int.
        '''
        if not isinstance(batch_size, int) or isinstance(batch_size, bool):
            raise ValueError('batch_size must be a int number')
        if batch_size <= 0:
            # a zero or negative step would never advance through the data
            raise ValueError('batch_size must be a positive number')

        path_img = os.path.join(self.path, self.train_img_fname)
        path_lbl = os.path.join(self.path, self.train_lbl_fname)
        self._get_dataset_size(path_img, path_lbl)

        for batch_sp in range(0, self.dataset_size, batch_size):
            # shrink the final batch so it never runs past the dataset
            current = min(batch_size, self.dataset_size - batch_sp)
            ims, labels = self.load(path_img, path_lbl,
                                    batch=[batch_sp, current])

            self.train_images = self.process_images(ims)
            self.train_labels = self.process_labels(labels)

            yield self.train_images, self.train_labels

    @staticmethod
    def _read_header(file, fmt, expected_magic):
        '''
        Read a big-endian header described by struct format `fmt` from
        `file`, check that its first field equals `expected_magic`, and
        return the remaining fields as a tuple.

        Raises ValueError on a magic number mismatch.
        '''
        fields = struct.unpack(fmt, file.read(struct.calcsize(fmt)))
        magic = fields[0]
        if magic != expected_magic:
            raise ValueError('Magic number mismatch, expected {},'
                             'got {}'.format(expected_magic, magic))
        return fields[1:]

    def _get_dataset_size(self, path_img, path_lbl):
        '''
        Read the item counts from both file headers and store the common
        value in self.dataset_size.

        Raises ValueError on a bad magic number or if the counts differ.
        '''
        with self.opener(path_lbl, 'rb') as file:
            lb_size, = self._read_header(file, ">II", 2049)

        with self.opener(path_img, 'rb') as file:
            im_size, = self._read_header(file, ">II", 2051)

        if lb_size != im_size:
            raise ValueError('image size is not equal to label size')

        self.dataset_size = lb_size

    def process_images(self, images):
        '''Convert raw images to the configured return type and mode.'''
        # compare by value: `is` on strings only works by interning accident
        if self.return_type == 'lists':
            return self.process_images_to_lists(images)
        elif self.return_type == 'numpy':
            return self.process_images_to_numpy(images)
        else:
            raise MNISTException("unknown return_type '{}'".format(self.return_type))

    def process_labels(self, labels):
        '''Return labels unchanged ('lists') or as a numpy array ('numpy').'''
        if self.return_type == 'lists':
            return labels
        elif self.return_type == 'numpy':
            _np = _import_numpy()
            return _np.array(labels)
        else:
            raise MNISTException("unknown return_type '{}'".format(self.return_type))

    def process_images_to_numpy(self,images):
        '''Return `images` as a numpy array, binarized according to mode.'''
        _np = _import_numpy()

        images_np = _np.array(images)

        if self.mode == 'vanilla':
            pass # no processing, return them vanilla

        elif self.mode == 'randomly_binarized':
            r = _np.random.random(images_np.shape)
            # r is drawn from [0, 1): strict '<' gives P(1) = pixel/255, so a
            # 0 pixel never turns on. 255.0 forces float division.
            images_np = (r < (images_np / 255.0)).astype('int') # bool to 0/1

        elif self.mode == 'rounded_binarized':
            images_np = ((images_np / 255.0) > 0.5).astype('int') # bool to 0/1

        else:
            raise MNISTException("unknown mode '{}'".format(self.mode))

        return images_np

    def process_images_to_lists(self,images):
        '''
        Binarize `images` (a list of pixel lists) in place according to
        mode and return the same list.
        '''
        if self.mode == 'vanilla':
            pass # no processing, return them vanilla

        elif self.mode == 'randomly_binarized':
            for image in images:
                for j, pixel in enumerate(image):
                    # random() is in [0, 1): strict '<' gives P(1) = pixel/255
                    image[j] = int(random.random() < pixel / 255.0) # bool to 0/1

        elif self.mode == 'rounded_binarized':
            for image in images:
                for j, pixel in enumerate(image):
                    image[j] = int(pixel / 255.0 > 0.5) # bool to 0/1
        else:
            raise MNISTException("unknown mode '{}'".format(self.mode))

        return images

    def opener(self, path_fn, *args, **kwargs):
        '''Open `path_fn`, through gzip (with '.gz' appended) when self.gz is set.'''
        if self.gz:
            return gzip.open(path_fn + '.gz', *args, **kwargs)
        else:
            return open(path_fn, *args, **kwargs)

    def load(self, path_img, path_lbl, batch=None):
        '''
        Read images and labels from a pair of IDX files.

        path_img -- image file path (without the '.gz' suffix)
        path_lbl -- label file path (without the '.gz' suffix)
        batch    -- optional (start_point, batch_size) list or tuple; only
                    that slice of the dataset is read. A slice reaching past
                    the end of the dataset is clamped to the available items.

        Returns (images, labels): images is a list of flat pixel lists,
        labels is an array('B').

        Raises ValueError on a malformed `batch`, a bad magic number,
        differing image/label counts, or files shorter than their headers
        declare.
        '''
        start = count = None
        if batch is not None:
            if not isinstance(batch, (list, tuple)) or len(batch) != 2:
                raise ValueError('batch should be a 1-D list'
                                 '(start_point, batch_size)')
            start, count = batch
            if start < 0 or count < 0:
                raise ValueError('batch start_point and batch_size must be '
                                 'non-negative')

        with self.opener(path_lbl, 'rb') as file:
            lbl_total, = self._read_header(file, ">II", 2049)

            if batch is None:
                labels = array("B", file.read())
            else:
                # clamp the request to what the header says is available
                start = min(start, lbl_total)
                count = min(count, lbl_total - start)
                # skip straight to the slice instead of reading everything
                file.seek(8 + start)
                labels = array("B", file.read(count))

        with self.opener(path_img, 'rb') as file:
            img_total, rows, cols = self._read_header(file, ">IIII", 2051)
            if img_total != lbl_total:
                raise ValueError('image size is not equal to label size')

            pixels = rows * cols
            if batch is None:
                count = img_total
                image_data = array("B", file.read())
            else:
                file.seek(16 + start * pixels)
                image_data = array("B", file.read(count * pixels))

        if len(labels) < count or len(image_data) < count * pixels:
            raise ValueError('dataset files are shorter than their headers '
                             'declare')

        images = []
        for i in range(count):
            flat = image_data[i * pixels:(i + 1) * pixels]

            if self.emnistRotate:
                # for some reason EMNIST is mirrored and rotated, i.e. stored
                # transposed: emit the stored image column by column
                images.append([p for c in range(cols) for p in flat[c::cols]])
            else:
                images.append(flat.tolist())

        return images, labels

    @classmethod
    def display(cls, img, width=28, threshold=200):
        '''
        Render a flat image as ASCII art: '@' for pixels above `threshold`,
        '.' otherwise, with a newline before every row of `width` pixels.
        '''
        chars = []
        for i, pixel in enumerate(img):
            if i % width == 0:
                chars.append('\n')
            chars.append('@' if pixel > threshold else '.')
        return ''.join(chars)