.. note::
    :class: sphx-glr-download-link-note

    Click :ref:`here <sphx_glr_download_auto_examples_plot_missing_values.py>` to download the full example code
.. rst-class:: sphx-glr-example-title

.. _sphx_glr_auto_examples_plot_missing_values.py:


====================================================
Imputing missing values before building an estimator
====================================================

Missing values can be replaced by the mean, the median or the most frequent
value using the basic ``SimpleImputer``.
The median is a more robust estimator for data with high magnitude variables
which could dominate results (otherwise known as a 'long tail').

Another option is the ``ChainedImputer``. This uses round-robin linear
regression, treating every variable as an output in turn. The version
implemented assumes Gaussian (output) variables. If your features are obviously
non-Normal, consider transforming them to look more Normal so as to improve
performance.



.. code-block:: python


    import numpy as np
    import matplotlib.pyplot as plt

    from sklearn.datasets import load_diabetes
    from sklearn.datasets import load_boston
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer, ChainedImputer
    from sklearn.model_selection import cross_val_score

    rng = np.random.RandomState(0)


    def get_results(dataset):
        X_full, y_full = dataset.data, dataset.target
        n_samples = X_full.shape[0]
        n_features = X_full.shape[1]

        # Estimate the score on the entire dataset, with no missing values
        estimator = RandomForestRegressor(random_state=0, n_estimators=100)
        full_scores = cross_val_score(estimator, X_full, y_full,
                                      scoring='neg_mean_squared_error')

        # Add missing values in 75% of the lines
        missing_rate = 0.75
        n_missing_samples = int(np.floor(n_samples * missing_rate))
        missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                              dtype=np.bool),
                                     np.ones(n_missing_samples,
                                             dtype=np.bool)))
        rng.shuffle(missing_samples)
        missing_features = rng.randint(0, n_features, n_missing_samples)

        # Estimate the score after replacing missing values by 0
        X_missing = X_full.copy()
        X_missing[np.where(missing_samples)[0], missing_features] = 0
        y_missing = y_full.copy()
        estimator = RandomForestRegressor(random_state=0, n_estimators=100)
        zero_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                             scoring='neg_mean_squared_error')

        # Estimate the score after imputation (mean strategy) of the missing values
        X_missing = X_full.copy()
        X_missing[np.where(missing_samples)[0], missing_features] = 0
        y_missing = y_full.copy()
        estimator = Pipeline([("imputer", SimpleImputer(missing_values=0,
                                                        strategy="mean")),
                              ("forest", RandomForestRegressor(random_state=0,
                                                               n_estimators=100))])
        mean_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                             scoring='neg_mean_squared_error')

        # Estimate the score after chained imputation of the missing values
        estimator = Pipeline([("imputer", ChainedImputer(missing_values=0,
                                                         random_state=0)),
                              ("forest", RandomForestRegressor(random_state=0,
                                                               n_estimators=100))])
        chained_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                                scoring='neg_mean_squared_error')

        return ((full_scores.mean(), full_scores.std()),
                (zero_impute_scores.mean(), zero_impute_scores.std()),
                (mean_impute_scores.mean(), mean_impute_scores.std()),
                (chained_impute_scores.mean(), chained_impute_scores.std()))


    results_diabetes = np.array(get_results(load_diabetes()))
    mses_diabetes = results_diabetes[:, 0] * -1
    stds_diabetes = results_diabetes[:, 1]

    results_boston = np.array(get_results(load_boston()))
    mses_boston = results_boston[:, 0] * -1
    stds_boston = results_boston[:, 1]

    n_bars = len(mses_diabetes)
    xval = np.arange(n_bars)

    x_labels = ['Full data',
                'Zero imputation',
                'Mean Imputation',
                'Chained Imputation']
    colors = ['r', 'g', 'b', 'orange']

    # plot diabetes results
    plt.figure(figsize=(12, 6))
    ax1 = plt.subplot(121)
    for j in xval:
        ax1.barh(j, mses_diabetes[j], xerr=stds_diabetes[j],
                 color=colors[j], alpha=0.6, align='center')

    ax1.set_title('Imputation Techniques with Diabetes Data')
    ax1.set_xlim(left=np.min(mses_diabetes) * 0.9,
                 right=np.max(mses_diabetes) * 1.1)
    ax1.set_yticks(xval)
    ax1.set_xlabel('MSE')
    ax1.invert_yaxis()
    ax1.set_yticklabels(x_labels)

    # plot boston results
    ax2 = plt.subplot(122)
    for j in xval:
        ax2.barh(j, mses_boston[j], xerr=stds_boston[j],
                 color=colors[j], alpha=0.6, align='center')

    ax2.set_title('Imputation Techniques with Boston Data')
    ax2.set_yticks(xval)
    ax2.set_xlabel('MSE')
    ax2.invert_yaxis()
    ax2.set_yticklabels([''] * n_bars)

    plt.show()

**Total running time of the script:** ( 0 minutes  0.000 seconds)


.. _sphx_glr_download_auto_examples_plot_missing_values.py:


.. only :: html

 .. container:: sphx-glr-footer
    :class: sphx-glr-footer-example



  .. container:: sphx-glr-download

     :download:`Download Python source code: plot_missing_values.py <plot_missing_values.py>`



  .. container:: sphx-glr-download

     :download:`Download Jupyter notebook: plot_missing_values.ipynb <plot_missing_values.ipynb>`


.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.readthedocs.io>`_
