To make sure the Vega charts render correctly, view the notebook not from the GitHub repo but from the published website: https://walterra.github.io/jupyter2kibana/viz-3a-iris-classification.html
This notebook demonstrates how small multiples of stacked bar chart histograms can be used to evaluate machine learning results, in this case by comparing the original species labels to the predicted categorizations.
import datetime
import altair as alt
import eland as ed
from elasticsearch import Elasticsearch
import elastic_transport
import logging
import json
import numpy as np
import matplotlib.pyplot as plt
import urllib3
import warnings
alt.data_transformers.disable_max_rows()
logging.getLogger("elastic_transport").setLevel(logging.ERROR)
# Suppress insecure SSL connection warnings.
# In development environments that use `verify_certs=False`,
# you might want to silence these warnings.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
urllib3.disable_warnings(elastic_transport.SecurityWarning)
# For rendering the notebook to HTML hide all warnings
warnings.filterwarnings('ignore')
from vega_datasets import data
pd_df = data.iris()
pd_df.info()
The next cells load the connection config, create an Elasticsearch client, and use eland to push the iris dataset (loaded above from Vega's example datasets) to an Elasticsearch index.
with open('config.json') as config_file:
    es_config = json.load(config_file)
# First instantiate an `Elasticsearch` client with the supplied config
es = Elasticsearch(
    hosts=[es_config['es_client']],
    basic_auth=(
        es_config['user'],
        es_config['password']
    ),
    # Fall back to `verify_certs=False` only in development environments
    # with self-signed certificates
    verify_certs=False
)
# Push the pandas DataFrame to the 'eland_iris' index, replacing it if it exists
ed_df = ed.pandas_to_eland(
    pd_df.dropna(),
    es,
    'eland_iris',
    es_if_exists="replace",
    es_refresh=True
)
ed_df.columns
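As a quick sanity check, you can verify that the documents arrived in Elasticsearch. This is a minimal sketch using the standard `es.count` API; the full iris dataset holds 150 documents.

# The count should report 150 documents for the full iris dataset
es.count(index='eland_iris')['count']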
Before you can run the next cell, you need to run a machine learning job in Elasticsearch that performs classification using the following configuration. It writes its results to an index named `iris_classification_3`, which we can then access in the subsequent cells.
{
    "id": "iris_classification_3",
    "create_time": 1604478053294,
    "version": "8.0.0",
    "description": "",
    "source": {
        "index": ["eland_iris"],
        "query": {"match_all": {}}
    },
    "dest": {
        "index": "iris_classification_3",
        "results_field": "ml"
    },
    "analysis": {
        "classification": {
            "dependent_variable": "species",
            "num_top_feature_importance_values": 10,
            "class_assignment_objective": "maximize_minimum_recall",
            "num_top_classes": 2,
            "prediction_field_name": "species_prediction",
            "training_percent": 80,
            "randomize_seed": -2245814554146464500
        }
    },
    "analyzed_fields": {
        "includes": [],
        "excludes": []
    },
    "model_memory_limit": "100mb",
    "allow_lazy_start": false,
    "max_num_threads": 1
}
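If you prefer to create and start the job from the notebook instead of the Kibana UI, a minimal sketch using the Python client's data frame analytics APIs could look like this (assuming an 8.x client; the job ID, indices, and analysis settings mirror the configuration above, and the seed is omitted so Elasticsearch picks one):

# Create the classification job (mirrors the JSON configuration above)
es.ml.put_data_frame_analytics(
    id='iris_classification_3',
    source={'index': ['eland_iris'], 'query': {'match_all': {}}},
    dest={'index': 'iris_classification_3', 'results_field': 'ml'},
    analysis={
        'classification': {
            'dependent_variable': 'species',
            'num_top_feature_importance_values': 10,
            'class_assignment_objective': 'maximize_minimum_recall',
            'num_top_classes': 2,
            'prediction_field_name': 'species_prediction',
            'training_percent': 80
        }
    },
    model_memory_limit='100mb'
)
# Start the job; it runs asynchronously and writes to the destination index
es.ml.start_data_frame_analytics(id='iris_classification_3')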
# Load the classification results index into an eland DataFrame
index_name = 'iris_classification_3'
ed_df = ed.DataFrame(es, index_name)
ed_df.head()
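The results index also records which documents were used for training. The following is a minimal sketch; it assumes the flattened column name `ml.is_training`, which data frame analytics writes alongside the prediction.

# How many documents were held out for testing vs. used for training?
ed_df['ml.is_training'].value_counts()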
# Note: To create the Vega spec using Altair, we first reference Elasticsearch via URL. This only
# works for unsecured ES instances. If your ES instance uses SSL and/or authentication, the chart
# in this cell will render empty. You can still save the visualization to Kibana correctly in the
# next cell, because there the URL gets replaced with an Elasticsearch query
# to be used via the Kibana Vega plugin.
# WARNING:
# Use the following proxy approach only for demo purposes in a development environment.
# It exposes a secured ES instance without authentication!
# To make this work for demo purposes, run the Node.js-based proxy in a separate terminal like this:
# NODE_TLS_REJECT_UNAUTHORIZED='0' node proxy
# URL as ES endpoint
# url = 'http://localhost:9220/'+index_name+'/_search?size=1000'
# URL static fallback
url = 'https://walterra.github.io/jupyter2kibana/data/iris_classification.json'
url_data = alt.Data(url=url, format=alt.DataFormat(property='hits.hits', type='json'))
fields = ['petalLength', 'petalWidth', 'sepalLength', 'sepalWidth']
# Map each field to a Vega expression that extracts it from the Elasticsearch hit,
# e.g. {'petalLength': 'datum._source.petalLength', ...}
rename_dict = {field: 'datum._source.' + field for field in fields}
def small_multiples(split='species'):
    # Build small multiples of binned histograms, one per attribute,
    # colored by the given split field ('species' or 'species_prediction').
    url_chart = alt.Chart(url_data).transform_calculate(
        species_prediction='datum._source.ml.species_prediction',
        species='datum._source.species'
    ).transform_calculate(**rename_dict).transform_fold(
        fields,
        as_=['attribute', 'value']
    ).mark_bar().encode(
        alt.X('value:Q', title='', bin=True),
        alt.Y('count()', title=''),
        tooltip=[
            alt.Tooltip('value:Q', bin=True, title='x'),
            alt.Tooltip('count()', title='y')
        ],
        color=split + ':N'
    ).properties(
        width=150,
        height=150
    )
    url_charts = alt.ConcatChart(
        concat=[
            url_chart.transform_filter(alt.datum.attribute == attribute).properties(title=attribute)
            for attribute in sorted(fields)
        ],
        columns=2
    ).resolve_axis(
        x='independent',
        y='independent'
    ).resolve_scale(
        x='independent',
        y='independent'
    )
    return url_charts
chart_raw = small_multiples('species')
chart_raw
chart_prediction = small_multiples('species_prediction')
chart_prediction
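Beyond the visual comparison, a quick tabular check is possible by pulling the results index back into pandas. This is a minimal sketch; it assumes eland flattens the nested prediction field to the column name `ml.species_prediction`, derived from the `results_field` and `prediction_field_name` in the job configuration above.

import pandas as pd
# Pull the classification results back into a regular pandas DataFrame
results_df = ed.eland_to_pandas(ed_df)
# Confusion matrix: original species labels vs. predicted ones
pd.crosstab(results_df['species'], results_df['ml.species_prediction'])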
# Scatterplot of petal length vs. petal width, colored by the original species label
alt.Chart(url_data).transform_calculate(
    species_prediction='datum._source.ml.species_prediction',
    species='datum._source.species'
).transform_calculate(**rename_dict).mark_circle(size=30).encode(
    x='petalLength:Q',
    y='petalWidth:Q',
    color='species:N',
    tooltip=['species:N']
)
from kibana_vega_util import saveVegaLiteVis
saveVegaLiteVis(
    index_name,
    'iris-histogram-raw-1',
    chart_raw,
    resultSize=10000,
    timeField=False,
    # Fall back to `verify=False` only in development environments with self-signed certificates
    verify=False
)
saveVegaLiteVis(
    index_name,
    'iris-histogram-prediction-1',
    chart_prediction,
    resultSize=10000,
    timeField=False,
    # Fall back to `verify=False` only in development environments with self-signed certificates
    verify=False
)