Source code for sumnplot.plot.matplotlib

"""Module for plotting summarised data with matplotlib."""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from typing import List, Optional

from ..checks import check_type, check_condition


def plot_summarised_variable(
    summary_df: pd.DataFrame,
    axis_right: int,
    axis_left: Optional[List[int]] = None,
    title: Optional[str] = None,
    figsize_h: int = 14,
    figsize_w: int = 8,
    legend: bool = True,
):
    """Produce one way summary plot from pre-summarised data.

    The ``axis_right`` column is drawn as gold bars (one bar per row of
    ``summary_df``) and each ``axis_left`` column is drawn as a line on a
    twinned y axis. The plot is drawn with matplotlib.pyplot and nothing is
    returned.

    Parameters
    ----------
    summary_df : pd.DataFrame
        DataFrame with summarised info to plot. The index values are used as
        the x axis tick labels.

    axis_right : int
        The index of the column in summary_df to plot as bars. Typically
        this would be a weights column.

    axis_left : Optional[List[int]], default = None
        The index of the columns in summary_df to plot as lines. Currently
        the maximum number of line columns supported is 5 (one colour is
        defined per line).

    title : str, default = None
        Title for the plot. If None summary_df.index.name is used as the
        title.

    figsize_h : int, default = 14
        First element of the figsize tuple passed to
        matplotlib.pyplot.subplots. Note matplotlib interprets the first
        element of figsize as the figure width.

    figsize_w : int, default = 8
        Second element of the figsize tuple passed to
        matplotlib.pyplot.subplots. Note matplotlib interprets the second
        element of figsize as the figure height.

    legend : bool, default = True
        Should a legend be added to the plot?

    Raises
    ------
    ValueError
        If axis_right also appears in axis_left, or if axis_left contains
        more columns than there are line colours available.

    Notes
    -----
    Argument types and column index upper bounds are validated with
    check_type and check_condition; the exceptions those helpers raise are
    defined in the checks module. Only the upper bound of each column index
    is checked.

    """

    LEFT_Y_AXIS_COLOURS = ["magenta", "forestgreen", "lime", "orangered", "dodgerblue"]

    check_type(summary_df, pd.DataFrame, "summary_df")
    check_type(axis_right, int, "axis_right")
    check_type(axis_left, list, "axis_left", none_allowed=True)
    check_type(title, str, "title", none_allowed=True)
    check_type(figsize_h, int, "figsize_h", none_allowed=True)
    check_type(figsize_w, int, "figsize_w", none_allowed=True)
    check_type(legend, bool, "legend")

    n_columns = summary_df.shape[1]

    check_condition(
        axis_right <= n_columns - 1,
        f"only {n_columns} columns in summary_df but axis_right = {axis_right}",
    )

    if axis_left is not None:

        if axis_right in axis_left:
            raise ValueError(
                f"column index {axis_right} specified for both right and left axes"
            )

        if len(axis_left) > len(LEFT_Y_AXIS_COLOURS):
            raise ValueError(
                f"only {len(LEFT_Y_AXIS_COLOURS)} plots supported for the left axis but {len(axis_left)} given"
            )

        for axis_left_no, axis_left_index in enumerate(axis_left):

            # label the element with the name of the argument it came from
            check_type(axis_left_index, int, f"axis_left[{axis_left_no}]")

            check_condition(
                axis_left_index <= n_columns - 1,
                f"only {n_columns} columns in summary_df but axis_left[{axis_left_no}] = {axis_left_index}",
            )

    if title is None:
        title = summary_df.index.name

    _, ax1 = plt.subplots(figsize=(figsize_h, figsize_w))

    # bars are positioned at 0..n-1 and labelled with the index values
    bar_positions = np.arange(summary_df.shape[0])
    right_column = summary_df.columns[axis_right]

    # plot bin counts on 1st axis
    ax1.bar(
        bar_positions,
        summary_df[right_column].reset_index(drop=True),
        color="gold",
        label=right_column,
    )

    # called before twinx so the ticks are set on ax1 (the current axes)
    plt.xticks(bar_positions, summary_df.index, rotation=270)

    ax2 = ax1.twinx()

    if axis_left is not None:

        for colour, left_axis_column_index in zip(LEFT_Y_AXIS_COLOURS, axis_left):

            left_column = summary_df.columns[left_axis_column_index]

            # positional index so points line up with the bars; nulls are
            # dropped so the line joins the remaining points
            line_values = summary_df[left_column].reset_index(drop=True).dropna()

            ax2.plot(
                line_values.index,
                line_values,
                color=colour,
                linestyle="-",
                marker="D",
                label=left_column,
            )

    if legend:

        ax1.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)

        # an empty axis_left list plots no lines so there is nothing to label
        if axis_left:
            ax2.legend(bbox_to_anchor=(1.05, 0.94), loc=2, borderaxespad=0.0)

    plt.title(title, fontsize=20)
def plot_summarised_variable_2way(
    summary_df: pd.DataFrame,
    axis_right: int,
    axis_left: Optional[List[int]] = None,
    bar_type: Optional[str] = "stacked",
    bars_percent: Optional[bool] = False,
    title: Optional[str] = None,
    figsize_h: int = 14,
    figsize_w: int = 8,
    legend: bool = True,
):
    """Produce two way summary plot from pre-summarised data.

    summary_df is expected to be indexed by two levels; the first level
    defines the x axis bins and the second level splits each bin. The
    ``axis_right`` column is drawn as bars (one bar segment per second level
    value) and each ``axis_left`` column is drawn as one line per second
    level value on a twinned y axis. The plot is drawn with
    matplotlib.pyplot and nothing is returned.

    Parameters
    ----------
    summary_df : pd.DataFrame
        DataFrame with summarised info to plot. Must have a 2 level
        MultiIndex.

    axis_right : int
        The index of the column in summary_df to plot as bars. Typically
        this would be a weights column.

    axis_left : Optional[List[int]], default = None
        The index of the columns in summary_df to plot as lines. Currently
        only 3 left axis columns are supported.

    bar_type : Optional[str], default = "stacked"
        Type of bars to plot. Must be either "stacked" or "side_by_side".

    bars_percent : Optional[bool], default = False
        Should bars be plotted as percentage of total within each bar?

    title : str, default = None
        Title for the plot. If None the title is built from the names of the
        two index levels of summary_df.

    figsize_h : int, default = 14
        First element of the figsize tuple passed to
        matplotlib.pyplot.subplots. Note matplotlib interprets the first
        element of figsize as the figure width.

    figsize_w : int, default = 8
        Second element of the figsize tuple passed to
        matplotlib.pyplot.subplots. Note matplotlib interprets the second
        element of figsize as the figure height.

    legend : bool, default = True
        Should a legend be added to the plot?

    Raises
    ------
    ValueError
        If axis_right also appears in axis_left, if axis_left contains more
        columns than are supported, if the second index level has more
        values than there are bar colours, or if bar_type is not one of the
        supported values.

    Notes
    -----
    Argument types, column index upper bounds and the index structure are
    validated with check_type and check_condition; the exceptions those
    helpers raise are defined in the checks module.

    """

    BIN_COLOURS = [
        "gold",
        "khaki",
        "goldenrod",
        "darkkhaki",
        "darkgoldenrod",
        "olive",
        "y",
    ]

    # one colour family per left axis column, one shade per split level
    LEFT_AXIS_COLOURS = [
        [
            "magenta",
            "m",
            "orchid",
            "mediumvioletred",
            "deeppink",
            "darkmagenta",
            "darkviolet",
        ],
        [
            "forestgreen",
            "darkgreen",
            "seagreen",
            "green",
            "darkseagreen",
            "g",
            "mediumseagreen",
        ],
        [
            "lime",
            "limegreen",
            "greenyellow",
            "lawngreen",
            "chartreuse",
            "lightgreen",
            "springgreen",
        ],
    ]

    check_type(summary_df, pd.DataFrame, "summary_df")
    check_type(axis_right, int, "axis_right")
    check_type(axis_left, list, "axis_left", none_allowed=True)
    check_type(bar_type, str, "bar_type", none_allowed=True)
    check_type(bars_percent, bool, "bars_percent", none_allowed=True)
    check_type(title, str, "title", none_allowed=True)
    check_type(figsize_h, int, "figsize_h", none_allowed=True)
    check_type(figsize_w, int, "figsize_w", none_allowed=True)
    check_type(legend, bool, "legend")

    n_columns = summary_df.shape[1]

    check_condition(
        axis_right <= n_columns - 1,
        f"only {n_columns} columns in summary_df but axis_right = {axis_right}",
    )

    if axis_left is not None:

        if axis_right in axis_left:
            raise ValueError(
                f"column index {axis_right} specified for both right and left axes"
            )

        if len(axis_left) > len(LEFT_AXIS_COLOURS):
            raise ValueError(
                f"only {len(LEFT_AXIS_COLOURS)} plots supported for the left axis but {len(axis_left)} given"
            )

        for axis_left_no, axis_left_index in enumerate(axis_left):

            # label the element with the name of the argument it came from
            check_type(axis_left_index, int, f"axis_left[{axis_left_no}]")

            check_condition(
                axis_left_index <= n_columns - 1,
                f"only {n_columns} columns in summary_df but axis_left[{axis_left_no}] = {axis_left_index}",
            )

    # the index levels are read below, so fail clearly if they are missing
    check_condition(
        isinstance(summary_df.index, pd.MultiIndex) and summary_df.index.nlevels == 2,
        "summary_df must have a 2 level MultiIndex (by column then split by column)",
    )

    n_index_split_levels = len(summary_df.index.levels[1])

    if n_index_split_levels > len(BIN_COLOURS):
        raise ValueError(
            f"only {len(BIN_COLOURS)} levels supported for the second groupby column but {n_index_split_levels} given in summary_df"
        )

    # validate before any figure is created so bad input leaves no open figure
    if bar_type not in ("stacked", "side_by_side"):
        raise ValueError(f"unexpected value for bar_type; {bar_type}")

    by_col = summary_df.index.names[0]
    split_by_col = summary_df.index.names[1]

    if title is None:
        title = f"{by_col} by {split_by_col}"

    _, ax1 = plt.subplots(figsize=(figsize_h, figsize_w))

    right_column = summary_df.columns[axis_right]

    # turn data into by_col x split_by_col table and fill in levels
    # with no weight (i.e. nulls) with 0
    unstack_weights = summary_df[right_column].unstack().fillna(0)

    if bars_percent:
        # divide every column by the total weight of its row
        unstack_weights = unstack_weights.div(unstack_weights.sum(axis=1), axis=0)

    n_bins, n_split_levels = unstack_weights.shape

    # f-strings so that non-string (or missing) level names can be labelled
    unstack_weights.columns = pd.Index(
        [
            f"({split_by_col} = {x}) {right_column}"
            for x in unstack_weights.columns.values
        ]
    )

    bin_positions = np.arange(n_bins)

    if bar_type == "stacked":

        # running height of each stack, the bottom of the next segment
        top_bins = np.zeros(n_bins)

        # plot bin counts on 1st axis
        for i, bar_label in enumerate(unstack_weights.columns):

            heights = unstack_weights.iloc[:, i].reset_index(drop=True)

            ax1.bar(
                x=bin_positions,
                height=heights,
                color=BIN_COLOURS[i],
                bottom=top_bins,
                label=bar_label,
            )

            top_bins = top_bins + heights

        # stacked bars are centred on the bin position so no shift needed
        x_tick_offset = 0

    else:

        # the group of bars for each bin spans 0.8 of the bin spacing
        bar_width = 0.8 / n_split_levels

        for i, bar_label in enumerate(unstack_weights.columns):

            ax1.bar(
                bin_positions + (i * bar_width),
                unstack_weights.iloc[:, i].reset_index(drop=True),
                color=BIN_COLOURS[i],
                width=bar_width,
                label=bar_label,
            )

        # shift ticks (and lines) to the middle of each group of bars
        x_tick_offset = (bar_width * (n_split_levels / 2)) - (bar_width * 0.5)

    # called before twinx so the ticks are set on ax1 (the current axes)
    plt.xticks(bin_positions + x_tick_offset, unstack_weights.index, rotation=270)

    ax2 = ax1.twinx()

    if axis_left is not None:

        for column_no, axis_left_column_index in enumerate(axis_left):

            left_column = summary_df.columns[axis_left_column_index]

            unstacked_left_axis_column = summary_df[left_column].unstack()

            unstacked_left_axis_column.columns = pd.Index(
                [
                    f"({split_by_col} = {x}) {left_column}"
                    for x in unstacked_left_axis_column.columns.values
                ]
            )

            for i, line_label in enumerate(unstacked_left_axis_column.columns):

                # positional index so points line up with the bars; nulls
                # are dropped so the line joins the remaining points
                line_values = (
                    unstacked_left_axis_column.iloc[:, i]
                    .reset_index(drop=True)
                    .dropna()
                )

                ax2.plot(
                    line_values.index + x_tick_offset,
                    line_values,
                    color=LEFT_AXIS_COLOURS[column_no][i],
                    linestyle="-",
                    marker="D",
                    label=line_label,
                )

    if legend:

        ax1.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)

        # an empty axis_left list plots no lines so there is nothing to label
        if axis_left:

            # push the line legend down by one row per bar legend entry
            ax2.legend(
                bbox_to_anchor=(1.05, (0.94 - (0.03 * n_split_levels))),
                loc=2,
                borderaxespad=0.0,
            )

    plt.title(title, fontsize=20)