# PyParis logo: build an IPython Image object from the local file "PyParis.png"
from IPython.display import Image
Image("PyParis.png")
Python & data Science
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# display options: set the pandas "display.max_rows" / "display.max_columns"
# limits used when a DataFrame is rendered
pd.set_option("display.max_rows", 16)
pd.set_option("display.max_columns", 30)
Matplotlib is a Python 2D plotting library which produces publication quality figures in a variety of hardcopy formats and interactive environments across platforms.
In matplotlib.pyplot, the 3 main objects are:
- Figure: the top-level container for all the plot elements.
- Axes (or Subplots): the Axes contains most of the figure elements and sets the coordinate system.
- Axis: the X or Y axis of a graphic, not to be confused with Axes.
Nota bene: all instructions from the creation of a figure to its display accumulate on the same graphic, within a script or within a single notebook cell.
# style
# The seaborn style sheets were renamed 'seaborn-v0_8-*' in matplotlib 3.6 and
# the legacy names were removed afterwards, so use whichever name this
# installation actually provides rather than hard-coding the old one.
darkgrid_style = next(
    (name for name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
     if name in plt.style.available),
    'default',  # neither variant is installed: keep matplotlib's default look
)
plt.style.use(darkgrid_style)
plt.subplots(figsize=(5, 5));
# names listed in matplotlib.pyplot.style.available, separated by spaces
print(' '.join(plt.style.available))
# temporary styling: the style is requested through a context manager,
# and the figure is created inside the `with` block
with plt.style.context('fivethirtyeight'):
    plt.subplots(figsize=(5, 5))
# pseudo-random walk: cumulative sum of 100 uniform steps centred on zero
# (uncomment the next line to fix the seed)
#np.random.seed(0)
steps = np.random.random(100) - 0.5
plt.plot(np.cumsum(steps));
# a figure holding exactly one subplot
fig, ax = plt.subplots(figsize=(8, 6))
walk = (np.random.random(100) - 0.5).cumsum()
ax.plot(walk)
# horizontal black reference line at y = 0
ax.axhline(y=0, color='k')
ax.set_title("Figure 1")
ax.legend(["Random walk"]);
# %load exercises/ex1.py
def plot_random_walk2():
    """Plot two random walks (green and red), each with a dotted line at its mean."""
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.set_title("Figure 1")
    # generate both walks up front, in the same order as they are plotted
    walks = [(np.random.random(100) - 0.5).cumsum() for _ in range(2)]
    colors = ('g', 'r')
    # both curves are added before the mean lines, keeping the artist order
    # that the label list passed to legend() below relies on
    for walk, color in zip(walks, colors):
        ax.plot(walk, color=color)
    for walk, color in zip(walks, colors):
        ax.axhline(y=walk.mean(), color=color, linestyle=':')
    ax.legend(["Random walk 1", "Random walk 2"])


plot_random_walk2()
The legend position is set with the parameter loc of legend(), default value 'best'.
It is also possible to set a relative position with the option bbox_to_anchor=(x, y):
# %load exercises/ex2.py
def plot_random_walk2(**kwargs):
    """Plot two random walks with their means; **kwargs are forwarded to ax.legend()."""
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.set_title("Figure 1")
    # generate both walks up front, in the same order as they are plotted
    walks = [(np.random.random(100) - 0.5).cumsum() for _ in range(2)]
    colors = ('g', 'r')
    # both curves are added before the mean lines, keeping the artist order
    # that the label list passed to legend() below relies on
    for walk, color in zip(walks, colors):
        ax.plot(walk, color=color)
    for walk, color in zip(walks, colors):
        ax.axhline(y=walk.mean(), color=color, linestyle=':')
    ax.legend(["Random walk 1", "Random walk 2"], **kwargs)


plot_random_walk2(loc='lower left')
# alternative: anchor the legend at a relative position instead
# plot_random_walk2(bbox_to_anchor=(1.3, 0.6))
1) With the add_subplot() method.
# compound graphics: a 2 x 2 grid whose cells are added one by one
fig = plt.figure(figsize=(8, 6))
ax1, ax2, ax3, ax4 = [fig.add_subplot(2, 2, position) for position in (1, 2, 3, 4)]
for number, subplot in enumerate((ax1, ax2, ax3, ax4), start=1):
    subplot.set_title("Figure {}".format(number))

# top row: one curve per subplot
ax1.plot(np.random.random(10))
ax2.plot(np.random.random(10), 'r--')

# bottom row: the same data drawn twice, once as a line and once as markers
x = np.random.random(10)
ax3.plot(x, 'c:')
ax3.plot(x, '*', color='darkred')

x = np.random.random(10)
ax4.plot(x, '-.', color='0.3')
ax4.plot(x, '^', color='#ff0080');
2) With the subplots() function.
# compound graphics: the 2 x 2 grid is created by a single plt.subplots() call
fig, grid = plt.subplots(nrows=2, ncols=2, figsize=(8, 6))
ax1, ax2, ax3, ax4 = grid.ravel()

ax1.set_title("Figure 1")
ax1.plot(np.random.random(10))

# keyword spelling of the 'r--' format string
ax2.set_title("Figure 2")
ax2.plot(np.random.random(10), color='r', linestyle='--')

# same data twice: dotted cyan line, then dark red star markers without a line
ax3.set_title("Figure 3")
x = np.random.random(10)
ax3.plot(x, color='c', linestyle=':')
ax3.plot(x, marker='*', linestyle='none', color='darkred')

# same data twice: dash-dot grey line, then pink triangle markers without a line
ax4.set_title("Figure 4")
x = np.random.random(10)
ax4.plot(x, linestyle='-.', color='0.3')
ax4.plot(x, marker='^', linestyle='none', color='#ff0080');
In matplotlib there are:
- named colors, see matplotlib.colors.cnames
- marker styles, see matplotlib.lines.Line2D.markers
- line widths, set with the lw keyword.
print(*mpl.colors.cnames, sep=' ')
# the marker keys are not all strings, so each one is converted before joining
print(' '.join(str(marker) for marker in mpl.lines.Line2D.markers))
# %load exercises/ex3.py
# compound graphics: 2 rows x 3 columns
fig, grid = plt.subplots(2, 3, figsize=(12, 6))
ax1, ax2, ax3, ax4, ax5, ax6 = grid.ravel()
for number, subplot in enumerate((ax1, ax2, ax3, ax4, ax5, ax6), start=1):
    subplot.set_title("Figure {}".format(number))

# subplots 1 and 2: a single curve each
ax1.plot(np.random.random(10))
ax2.plot(np.random.random(10), 'r--')

# subplots 3 to 6: the same data drawn as a line, then again as markers
x = np.random.random(10)
ax3.plot(x, 'c:')
ax3.plot(x, '*', color='darkred')

x = np.random.random(10)
ax4.plot(x, '-.', color='0.3')
ax4.plot(x, '^', color='#ff0080')

x = np.random.random(10)
ax5.plot(x, '-', color='gold', lw=2)
ax5.plot(x, 's', color='k')

x = np.random.random(10)
ax6.plot(x, '--', color='0.7')
ax6.plot(x, 'o', color='b');
# load the medallists data set from its text file and preview the first rows
medallists_file = 'Summer Olympic medallists 1896 to 2008 - ALL MEDALISTS.txt'
df = pd.read_table(medallists_file)
df.head(5)
# setting category on medals
# 'Medal' becomes an ordered categorical with Bronze < Silver < Gold.
medals = ['Bronze', 'Silver', 'Gold']
# Compare (major, minor) as integers: a plain string comparison is
# lexicographic, so e.g. '0.9.0' < '0.21.0' would wrongly evaluate to False.
pandas_version = tuple(int(part) for part in pd.__version__.split('.')[:2])
if pandas_version < (0, 21):
    # legacy API: categories / ordered are passed straight to astype().
    # The column is read from df (the medallists table), the frame being updated.
    df['Medal'] = df['Medal'].astype('category', categories=medals, ordered=True)
else:
    from pandas.api.types import CategoricalDtype
    cat_medals = CategoricalDtype(categories=medals, ordered=True)
    df['Medal'] = df['Medal'].astype(cat_medals)
df.info()
# contingency table: editions as rows, medals as columns
table = pd.crosstab(index=df['Edition'], columns=df['Medal'])
table
# all medals in a single bar chart, one group of bars per edition
ax = table.plot.bar(title="Medals by edition and metal",
                    figsize=(12, 4))
# one tick per row of the table, labelled with the edition
ax.set_xticks(range(len(table)))
ax.set_xticklabels(table.index)
ax.set_xlabel("Editions");
# %load exercises/ex4.py
# share of medals won by each gender, per edition
gender_counts = pd.crosstab(df['Edition'], df['Gender'])
edition_totals = gender_counts.sum(axis=1)
# divide every row by its own total so that each edition sums to 1
table1 = gender_counts.div(edition_totals, axis=0)
ax = table1.plot.bar(stacked=True,
                     title="Medals by edition and gender",
                     figsize=(12, 4))
ax.set_xticks(range(len(table1)))
ax.set_xticklabels(table1.index)
ax.set_xlabel("Editions");
# for each sport the number of medals by metal
table2 = pd.crosstab(df['Sport'], df['Medal'])
# the rows of table2 are sports, so the title names sports (not editions)
ax = table2.plot(kind='bar',
                 figsize=(12, 4),
                 title="Medals by sport and metal")
# one tick per sport, labelled with the sport name
ax.set_xticks(range(len(table2)))
ax.set_xlabel("Sports")
ax.set_xticklabels(table2.index);
# %load exercises/ex5.py
# one bar chart per medal (subplots=True), editions along the x axis
table1 = pd.crosstab(df['Edition'], df['Medal'])
axes = table1.plot.bar(subplots=True,
                       #sharey=True,
                       title="Medals by metal and edition",
                       figsize=(9, 6),
                       color=['darkorange', 'silver', 'gold'],
                       rot=60)
# extra vertical space between the stacked subplots
plt.subplots_adjust(hspace=0.4)
# ticks and axis label are set on the last subplot only
bottom = axes[-1]
bottom.set_xticks(range(len(table1)))
bottom.set_xlabel("Editions")
bottom.set_xticklabels(table1.index)
# hide the legend of every subplot
for ax in axes:
    ax.legend().set_visible(False);
# one bar chart per medal (subplots=True), sports along the x axis
table2 = pd.crosstab(df['Sport'], df['Medal'])
axes = table2.plot.bar(subplots=True,
                       #sharey=True,
                       title="Medals by metal and sport",
                       figsize=(9, 6),
                       color=['darkorange', 'silver', 'gold'],
                       rot=60)
# extra vertical space between the stacked subplots
plt.subplots_adjust(hspace=0.4)
# right-align the tick labels of the current axes
plt.xticks(ha='right')
# ticks and axis label are set on the last subplot only
bottom = axes[-1]
bottom.set_xticks(range(len(table2)))
bottom.set_xlabel("Sports")
bottom.set_xticklabels(table2.index)
# hide the legend of every subplot
for ax in axes:
    ax.legend().set_visible(False);
Fine tuning of the default matplotlib parameters can be achieved by modifying:
- the matplotlib.rcParams variable
- the matplotlibrc file, see matplotlib.matplotlib_fname().
Of course, these settings require expertise and attention when modifying them.
# location of the matplotlibrc file, as reported by matplotlib
matplotlibrc_path = mpl.matplotlib_fname()
print(matplotlibrc_path)
# load the INSEE code / postal code table: ';'-separated, first 10 columns,
# indexed by 'Code INSEE'
geo = pd.read_csv('correspondance-code-insee-code-postal.csv',
                  sep=';',
                  index_col='Code INSEE',
                  usecols=range(10))
# 'geo_point_2d' holds two values separated by ", ": capture each part and
# store them as float columns
lat_lon = geo['geo_point_2d'].str.extract('(.+), (.+)', expand=True)
geo[['Latitude', 'Longitude']] = lat_lon.astype(float)
geo.head()
# a scatter plot of longitude against latitude gives a naive map
plt.scatter(x=geo['Longitude'], y=geo['Latitude'], s=3);
# %load exercises/ex6.py
# Metropolitan France: keep only the rows whose latitude is above 40
north_of_40 = geo['Latitude'] > 40
metro = geo.loc[north_of_40]
plt.scatter(x=metro['Longitude'], y=metro['Latitude'], s=3);
# order the rows by mean altitude, then colour each point by that altitude
metro = metro.sort_values(by='Altitude Moyenne')
plt.scatter(x=metro['Longitude'],
            y=metro['Latitude'],
            s=3,
            c=metro['Altitude Moyenne'],
            cmap=plt.cm.coolwarm)
# colour scale for the altitude values
plt.colorbar();
The matplotlib and seaborn modules also manage colormaps, i.e. palettes of colors associated with discrete or continuous data:
- matplotlib, see: http://matplotlib.org/users/colormaps.html
- seaborn, see: http://seaborn.pydata.org/tutorial/color_palettes.html
These modules also manage other palettes:
# names of the colormaps registered in plt.cm.datad, separated by spaces
print(' '.join(plt.cm.datad))
# %load exercises/ex7.py
# population density = population / area
geo['Densité'] = geo['Population'] / geo['Superficie']
# statuses in value_counts() order (most frequent first); this order becomes
# the order of the categorical below
status = list(geo['Statut'].value_counts().index)
# Compare (major, minor) as integers: a plain string comparison is
# lexicographic, so e.g. '0.9.0' < '0.21.0' would wrongly evaluate to False.
pandas_version = tuple(int(part) for part in pd.__version__.split('.')[:2])
if pandas_version < (0, 21):
    # legacy API: categories / ordered are passed straight to astype()
    geo['Statut'] = geo['Statut'].astype('category', categories=status, ordered=True)
else:
    from pandas.api.types import CategoricalDtype
    cat_status = CategoricalDtype(categories=status, ordered=True)
    geo['Statut'] = geo['Statut'].astype(cat_status)
# rebuild the Metropolitan France subset so that it carries the new columns
metro = geo.loc[geo['Latitude'] > 40]
# names of the regional prefectures
plt.figure(figsize=(7, 5))
# rows whose status is at least "Préfecture", largest population first
at_least_prefecture = metro["Statut"] >= "Préfecture"
metro_A = metro.loc[at_least_prefecture].sort_values("Population", ascending=False)
# rows whose status is below "Préfecture"
metro_B = metro.loc[metro["Statut"] < "Préfecture"]
# ordinary communes: small yellow dots
plt.scatter(x=metro_B["Longitude"],
            y=metro_B["Latitude"],
            s=3,
            c='y',
            edgecolors='none')
# prefectures: marker size from population, colour from density
# (ax is the collection returned by scatter; it feeds the colorbar below)
ax = plt.scatter(x=metro_A["Longitude"],
                 y=metro_A["Latitude"],
                 s=metro_A["Population"],
                 c=metro_A["Densité"],
                 cmap=plt.cm.Reds,
                 edgecolors='none')
# names of the regional prefectures, excluding PLM: rows whose name contains
# "ARRONDISSEMENT" are left out
is_regional = metro["Statut"] >= "Préfecture de région"
is_arrondissement = metro["Commune"].str.contains("ARRONDISSEMENT")
metro_C = metro.loc[is_regional & ~is_arrondissement]
for longitude, latitude, commune in zip(metro_C["Longitude"],
                                        metro_C["Latitude"],
                                        metro_C["Commune"]):
    plt.text(longitude, latitude, commune.title(), fontsize=8)
plt.colorbar(ax);
Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics.
Show point estimates and confidence intervals as rectangular bars.
# count of 'Athlete' entries for each sport, country (NOC), gender and medal
# (aggfunc='count'; pd.Series.nunique was left as a commented-out alternative)
group_keys = ['Sport', 'NOC', 'Gender', 'Medal']
table = df.pivot_table(values='Athlete', index=group_keys, aggfunc='count')
table = table.reset_index()
# keep the 10 sports with the most rows in the table
sports = table['Sport'].value_counts().index[:10]
in_top_sports = table['Sport'].isin(sports)
table = table.loc[in_top_sports]
table
# horizontal barplot: number of athletes per sport, one bar colour per medal
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=table,
            x='Athlete',
            y='Sport',
            hue='Medal',
            palette=['darkorange', 'silver', 'gold'],
            #ci=0,
            ax=ax);
Draw a scatterplot where one variable is categorical.
# strip plot: number of athletes per sport, coloured by gender, with jitter
fig, ax = plt.subplots(figsize=(12, 8))
sns.stripplot(data=table,
              x='Athlete',
              y='Sport',
              hue='Gender',
              palette=['blue', 'red'],
              jitter=True,
              ax=ax);
Show the counts of observations in each categorical bin using bars.
# restrict the data to the 20 NOC codes that appear most often
countries = df['NOC'].value_counts().index[:20]
from_main_country = df['NOC'].isin(countries)
table = df.loc[from_main_country]
table
# countplot: number of rows (medals) per country code
fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(data=table, x='NOC', ax=ax)
ax.set_xlabel('Country');
Flexibly plot a univariate distribution of observations.
# distplot of the 'Altitude Moyenne' column of geo (no regplot is drawn here)
# NOTE(review): distplot is reported as deprecated by recent seaborn releases;
# confirm against the seaborn version in use before replacing it
sns.distplot(geo['Altitude Moyenne']);
Flexibly plot a univariate distribution of observations.
# distplot of 'Superficie' for the HAUTS-DE-SEINE département only,
# drawn without the histogram (hist=False) and with a rug (rug=True)
in_hauts_de_seine = geo['Département'] == 'HAUTS-DE-SEINE'
hds = geo[in_hauts_de_seine]
sns.distplot(hds['Superficie'], rug=True, hist=False);
Plot data and a linear regression model fit.
# regplot = scatter plot + linear regression
# synthetic data: y is x scaled by a random factor (1 + u) / 2, u uniform in [0, 1)
x = np.random.random(100)
y = x * (1 + np.random.random(100)) / 2
# Pass the data by keyword: recent seaborn releases accept x and y as keyword
# arguments only, so the positional call regplot(x, y) raises a TypeError there.
# The keyword form also works on older releases.
g = sns.regplot(x=x, y=y);
Plot rectangular data as a color-encoded matrix
# contingency table (genders as rows, editions as columns) shown as a heatmap
table1 = pd.crosstab(index=df['Gender'], columns=df['Edition'])
sns.heatmap(data=table1, cmap='Blues');
# %load exercises/ex8.py
# same heatmap on a wide, flat figure with readable tick labels
fig, ax = plt.subplots(figsize=(12, 2))
sns.heatmap(data=table1, cmap='Blues', ax=ax)
# edition labels tilted by 60 degrees, gender labels kept horizontal
ax.set_xticklabels(table1.columns, rotation=60)
ax.set_yticklabels(table1.index, rotation=0)
ax.set_ylabel('Gender', rotation=0);
# medals per sport and gender, rows ordered by the 'Men' column
table2 = pd.crosstab(df['Sport'], df['Gender']).sort_values('Men')
fig, ax = plt.subplots(figsize=(5, 12))
# annot=True with fmt='d' writes each count in its cell as an integer
sns.heatmap(data=table2, annot=True, fmt='d', cmap='Blues', ax=ax);
# rows of df whose 'Gender' is 'Women'
dfw = df.loc[df['Gender'] == 'Women']
# 1 where the (sport, edition) cell has at least one row in dfw, 0 otherwise
women_counts = pd.crosstab(dfw['Sport'], dfw['Edition'])
table3 = (women_counts > 0).astype(int)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(data=table3, cmap='Reds', cbar=False, ax=ax);