Using Data Description Explainer

This example illustrates how to use the Data Description functions to interpret a data set. (See the fatf.transparency.data.describe_functions module for more details.)

# Author: Kacper Sokol <k.sokol@bristol.ac.uk>
# License: new BSD

from pprint import pprint
import numpy as np

import fatf.utils.data.datasets as fatf_datasets

import fatf.transparency.data.describe_functions as fatf_dd

print(__doc__)

# Load the Iris data set through fatf's data set utilities; the result is
# indexed by string keys below
iris_data_dict = fatf_datasets.load_iris()

# Names of the features and of the classes
iris_feature_names = iris_data_dict['feature_names']
iris_class_names = iris_data_dict['target_names']

# The data array and its target array; the targets are cast to integers
iris_X = iris_data_dict['data']
iris_y = iris_data_dict['target'].astype(int)

Start by describing all of the features in the data set.

# Describe all of the features held in the data array
features_description = fatf_dd.describe_array(iris_X)

# Convert feature ids into feature names: each key of the description is
# used to look up the matching entry of iris_feature_names
named_features_description = {
    iris_feature_names[feature_id]: feature_description
    for feature_id, feature_description in features_description.items()
}

print('Data Description for each feature:')
pprint(named_features_description)

Out:

Data Description for each feature:
{'petal length (cm)': {'25%': 1.600000023841858,
                       '50%': 4.3500001430511475,
                       '75%': 5.099999904632568,
                       'count': 150,
                       'max': 6.9,
                       'mean': 3.7580001,
                       'min': 1.0,
                       'nan_count': 0,
                       'std': 1.7594041},
 'petal width (cm)': {'25%': 0.30000001192092896,
                      '50%': 1.2999999523162842,
                      '75%': 1.7999999523162842,
                      'count': 150,
                      'max': 2.5,
                      'mean': 1.1993333,
                      'min': 0.1,
                      'nan_count': 0,
                      'std': 0.7596926},
 'sepal length (cm)': {'25%': 5.099999904632568,
                       '50%': 5.800000190734863,
                       '75%': 6.400000095367432,
                       'count': 150,
                       'max': 7.9,
                       'mean': 5.8433332,
                       'min': 4.3,
                       'nan_count': 0,
                       'std': 0.8253013},
 'sepal width (cm)': {'25%': 2.799999952316284,
                      '50%': 3.0,
                      '75%': 3.299999952316284,
                      'count': 150,
                      'max': 4.4,
                      'mean': 3.0573335,
                      'min': 2.0,
                      'nan_count': 0,
                      'std': 0.434411}}

Now describe the ‘petal width (cm)’ feature per class.

# Select the 'petal width (cm)' feature
selected_feature_id = 3
selected_feature_name = iris_feature_names[selected_feature_id]

# Group the data points per class
per_class_row_mask = dict()
for class_index, class_name in enumerate(iris_class_names):
    per_class_row_mask[class_name] = iris_y == class_index

# Explain the 'petal width (cm)' feature per class
per_class_explanation = dict()
for class_name, class_mask in per_class_row_mask.items():
    class_array = iris_X[class_mask, selected_feature_id]
    per_class_explanation[class_name] = fatf_dd.describe_array(class_array)

print('Per-class Data Description of each feature for class '
      "'{}' (class index {}):".format(selected_feature_name,
                                      selected_feature_id))
pprint(per_class_explanation)

Out:

Per-class Data Description of each feature for class 'petal width (cm)' (class index 3):
{'setosa': {'25%': 0.20000000298023224,
            '50%': 0.20000000298023224,
            '75%': 0.30000001192092896,
            'count': 50,
            'max': 0.6,
            'mean': 0.246,
            'min': 0.1,
            'nan_count': 0,
            'std': 0.10432641},
 'versicolor': {'25%': 1.2000000476837158,
                '50%': 1.2999999523162842,
                '75%': 1.5,
                'count': 50,
                'max': 1.8,
                'mean': 1.3260001,
                'min': 1.0,
                'nan_count': 0,
                'std': 0.19576517},
 'virginica': {'25%': 1.7999999523162842,
               '50%': 2.0,
               '75%': 2.299999952316284,
               'count': 50,
               'max': 2.5,
               'mean': 2.026,
               'min': 1.4,
               'nan_count': 0,
               'std': 0.2718897}}

Finally, describe the class distribution.

# Describe the target array as a categorical variable, using the integer
# labels as they are
target_explanation = fatf_dd.describe_categorical_array(iris_y)
print('Data Description of the target array:')
pprint(target_explanation)

# Since the target array is numerical, map every label to its class name
# first and describe the resulting array of names
class_name_per_row = [iris_class_names[label] for label in iris_y]
iris_y_named = np.array(class_name_per_row)
target_explanation_named = fatf_dd.describe_categorical_array(iris_y_named)
print('Data Description of the target array mapped to class names:')
pprint(target_explanation_named)

Out:

Data Description of the target array:
{'count': 150,
 'freq': 50,
 'is_top_unique': False,
 'top': '0',
 'unique': array(['0', '1', '2'], dtype='<U21'),
 'unique_counts': array([50, 50, 50])}
Data Description of the target array mapped to class names:
{'count': 150,
 'freq': 50,
 'is_top_unique': False,
 'top': 'setosa',
 'unique': array(['setosa', 'versicolor', 'virginica'], dtype='<U10'),
 'unique_counts': array([50, 50, 50])}

Total running time of the script: ( 0 minutes 0.053 seconds)

Gallery generated by Sphinx-Gallery