Reduce dataframe memory usage

import numpy as np
import pandas as pd

# Bit widths that may be tried for each NumPy dtype family. The family name
# concatenated with a width (e.g. 'uint' + 8 -> 'uint8') forms the dtype name
# handed to np.iinfo / np.finfo by the selector helpers.
AVAILABLE_SIZES = dict(
    int=[8, 16, 32, 64],
    uint=[8, 16, 32, 64],
    float=[16, 32, 64],
)


def _select_best_int_type(c_min, c_max):
    """
    Pick the first integer dtype whose range contains [c_min, c_max].

    Unsigned widths are tried in ascending order first, then signed widths
    in ascending order, using the widths listed in AVAILABLE_SIZES.

    :param c_min: smallest value that must be representable
    :param c_max: largest value that must be representable
    :return: dtype name such as 'uint8' or 'int32', or None if nothing fits
    """
    # Lazy generator: the signed widths are only looked up once every
    # unsigned candidate has been rejected.
    candidates = (
        '%s%d' % (family, width)
        for family in ('uint', 'int')
        for width in sorted(AVAILABLE_SIZES[family])
    )
    for dtype_name in candidates:
        limits = np.iinfo(dtype_name)
        if not (limits.min <= c_min and c_max <= limits.max):
            continue
        return dtype_name
    return None


def _select_best_float_type(c_min, c_max, allow_float16):
    """
    Pick the first float dtype whose finite range contains [c_min, c_max].

    Widths from AVAILABLE_SIZES['float'] are tried in ascending order. Only
    the representable range is checked, not the precision of the values.

    :param c_min: smallest value that must be representable
    :param c_max: largest value that must be representable
    :param allow_float16: when falsy, the 16-bit candidate is skipped
    :return: dtype name such as 'float32', or None if nothing fits
    """
    family = 'float'
    for width in sorted(AVAILABLE_SIZES[family]):
        half_precision_blocked = not allow_float16 and width == 16
        if not half_precision_blocked:
            dtype_name = '%s%d' % (family, width)
            limits = np.finfo(dtype_name)
            if c_min >= limits.min and c_max <= limits.max:
                return dtype_name
    return None


def reduce_memory_usage(
        data: pd.DataFrame,
        verbose: bool = True,
        allow_float16: bool = True) -> None:
    """
    Downcast the numeric columns of the DataFrame in place
    in order to reduce RAM impact.

    Only columns with a plain NumPy integer, unsigned integer or floating
    dtype are considered. Every other column (object, bool, datetime,
    timedelta, categorical, pandas extension dtypes, ...) is left untouched.
    A column is converted only when the selected dtype is strictly smaller
    than its current one, so a column is never widened.

    :param data: the DataFrame to be reduced (modified in place)
    :param verbose: print the small report
    :param allow_float16: allow half-precision float16 type; the selection
        only checks the value range, so values may be rounded
    :return: None
    """
    start_mem = data.memory_usage().sum() / 1024 ** 2
    if verbose:
        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))

    for col_name, col_type in data.dtypes.items():
        # Extension dtypes (category, nullable Int64, ...) are not np.dtype
        # instances; min()/max()/astype() on them can fail or change their
        # meaning, so only genuine NumPy numeric dtypes are processed.
        if not isinstance(col_type, np.dtype):
            continue
        if col_type.kind not in ('i', 'u', 'f'):
            # bool, datetime64, timedelta64, object, ... are not downcast.
            continue

        column = data[col_name]
        c_min = column.min()
        c_max = column.max()

        if col_type.kind == 'f':
            best = _select_best_float_type(c_min, c_max, allow_float16)
        else:  # int, uint
            best = _select_best_int_type(c_min, c_max)

        # Never widen: e.g. an existing float16 column must not become
        # float32 just because allow_float16 is False.
        if best and np.dtype(best).itemsize < col_type.itemsize:
            data[col_name] = column.astype(best)

    end_mem = data.memory_usage().sum() / 1024 ** 2
    if verbose:
        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
        # Guard against a zero baseline, which would otherwise yield nan.
        if start_mem > 0:
            decrease = 100 * (start_mem - end_mem) / start_mem
        else:
            decrease = 0.0
        print('Decreased by {:.1f}%'.format(decrease))

Last updated

Was this helpful?