# PyParis logo
from IPython.display import Image
Image("PyParis.png")
Expertise & IT Audit - Data Science - Python
Python 3.5+ : numpy, pandas, matplotlib
Datasets :
References
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
# magic command to display graphics within the notebook
%matplotlib inline
# display options
from pandas import set_option
set_option("display.max_rows", 16)
set_option("precision", 3)
plt.style.use('seaborn-notebook')
# loads US data from last year
path = "datasets/names/"
filename = "yob2016.txt"
us2016 = pd.read_csv(os.path.join(path, filename),
header=None,
names=['name', 'gender', 'births'])
us2016
We obtain a Pandas DataFrame object which represents 2D data. The info() method provides the main characteritics of the DataFrame.
# characteristics of a DataFrame
us2016.info()
# access to the index
us2016.index
# acces to columns
us2016.columns
# acces to dimensions
us2016.shape
We can access to a specific column by using the [] operator and obtain a Pandas Series object which represents 1D data. It shares the original DataFrame index.
# access to a specific column
us2016['name'] # us2016.name
# number of US unique names
us2016['name'].nunique()
Few operations on DataFrame: transposition, sorting and grouping
Transposition reverse rows and columns and switch index and column labels : transpose() or T method.
# transposition T or transpose()
us2016.T
It is possible to sort a DataFrame by one or more columns and specifying the ascending : sort_values() method.
# top 10 US names in 2016
us2016.sort_values('births', ascending=False).head(10)
DataFrame are provided with a powerful groupby() method which splits it into chuncks - according to the values of one or more columns - and then apply a summarization function to each chunk and concatenates the whole in a new DataFrame.
This illustrates the split-apply-combine strategy : groupby() then sum() methods.
# US female and male births of year 2016
us2016.groupby('gender').sum()
We load and merge all CSV files within the folder names. The year is extracted from file names and added as a new column. We use the append() method which concatenates 2 DataFrames.
# load all data in a single DataFrame
us = pd.DataFrame()
for root, dirs, files in os.walk(path):
for filename in files:
if filename[0:3] != "yob":
continue
csv = pd.read_csv(os.path.join(root, filename),
header=None,
names=['name', 'gender', 'births'])
year = int(filename[3:7]) # yobYYYY.txt
csv['year'] = year
us = us.append(csv, ignore_index=True)
us = us[['year', 'name', 'gender', 'births']] # fancy indexing in order to reorder the columns
us
# characteristics of a DataFrame
us.info()
# number of US unique names
us['name'].nunique()
# top 10 global US names within a year
us.sort_values('births', ascending=False).head(10)
# global US female and male births
us.groupby('gender').sum()['births'] [['births']]
What are the top 10 US names all years? We are going to implement a function so that it can be reused later.
# top (10) names all years
def top(df, size=10):
return df.groupby(['name', 'gender']).sum()['births'].sort_values(ascending=False).head(size)
# global US top 10 names all years
top(us)
Firstly, we load the French data in order to have a look at them.
# load
fr = pd.read_csv("datasets/nat2015.txt",
sep='\t',
encoding='latin-1',
header=0,
names = ['gender', 'name', 'year', 'births'])
fr = fr[['year', 'name', 'gender', 'births']]
fr
Secondly, we reload the French data so that they have the same characteristics as the US ones :
# load French data
fr = pd.read_csv("datasets/nat2015.txt",
sep='\t',
encoding='latin-1',
header=0,
names = ['gender', 'name', 'year', 'births'],
converters = {'gender': lambda x: 'M' if x == '1' else 'F',
'name': lambda x: '-'.join(map(lambda y: y.capitalize(), x.split('-'))),
'births': lambda x: int(float(x))})
fr = fr[(fr['year'] != 'XXXX') &
(fr['name'] != '_prenoms_rares') &
(fr['name'].str.len() != 1)]
fr['year'] = fr['year'].astype(int)
fr = fr[['year', 'name', 'gender', 'births']]
fr.sort_values(['year', 'gender', 'births', 'name'], ascending=[True, True, False, True], inplace=True)
fr.reset_index(drop=True, inplace=True)
fr
# characteristics of a DataFrame
fr.info()
# top 10 global French names within a year
fr.sort_values('births', ascending=False).head(10)
# global French female and male births
fr.groupby('gender').sum()['births']
# global French top 10 names all years
top(fr)
# making a boolean mask
mask = us['name'].str.len() == 2
mask
# using the mask to select data
us[mask]
We implement a function which perform all necessary operations and run US and French data using the unique() method.
# searching for names with 2 letters only
def names_of_length_2(df):
mask = df['name'].str.len() == 2
return df[mask]['name'].unique()
# US names with 2 letters
names_of_length_2(us)
# French names with 2 letters
names_of_length_2(fr)
Now, we implement a function which looks for names for which births within a year are equal to the year.
# looking for names for wich births within a year are equal to the year
def births_is_year(df):
return df[df['births'] == df['year']]
# US names with births equals year
births_is_year(us)
# French names with briths equals year
births_is_year(fr)
Here, we implement a function which search for prefix names.
First, we select the names starting with a specific prefix using the str operator and the startswith() method.
# select US names starting with 'Fran'
selection = us[us['name'].str.startswith("Fran")]
selection
Second, we can reuse the top function and implement a new function.
# looking for prefix
def look(df, prefix, **kwargs):
selection = df[df['name'].str.startswith(prefix)]
return top(selection, **kwargs)
# US names starting with "Fran"
look(us, "Fran")
# French names starting with "Fran"
look(fr, "Fran")
We can see a French compound name. What are the top French compound name?
# global French top 10 compound names all years
selection = fr[fr['name'].str.contains('-')]
top(selection)
Which French names are involved in a compound name?
# selection of data related to Kevin (M)
selection = us[(us['name'] == "Kevin") & (us['gender'] == "M")]
selection
Second, we switch the year column to be the index of the DataFrame.
# same with year as index
selection_years = selection.set_index('year')
selection_years
There is a strong connexion between Pandas DataFrame and Matplotlib. We can directly use the Matplotlib plot() function which applied to DataFrame or Series as a method.
# plot the births column along with the years
selection_years.plot(title="Births of Kevin (M) over years");
We can put every thing together in a single function and run the French data.
# evolution of a name (+ gender)
def plot_name_gender(df, name, gender):
selection = df[(df['name'] == name) & (df['gender'] == gender)]
selection_years = selection.set_index('year')
selection_years.plot(title="Births of {} ({}) over years".format(name, gender));
# evolution of French Kevin (M)
plot_name_gender(fr, 'Kevin', 'M')
We will try to find later correlations between US and French names over time.
DataFrame are provided with another powerful method pivot_table(). It produces a new DataFrame with as the index the different values of a first column, as the column index the different values of second column and the values of a third column are summarized into cells by using an aggregative function.
We use here this method to obtain the births by gender over years.
# pivoting US data by years and gender and computing the sum of births
genders = us.pivot_table(index='year',
columns='gender',
values='births',
aggfunc=np.sum)
genders
# plot number of births by gender over years
genders.plot(title="Total births by gender over years", color=['r', 'b']);
We can put every thing together in a single function and run the French data.
# plot number of births by gender over years
def plot_births(df):
table = df.pivot_table(index='year',
columns='gender',
values='births',
aggfunc=np.sum)
table.plot(title="Total births by gender over years", color=['r', 'b']);
# plot French births by gender over years
plot_births(fr)
We are using the crosstab() method in order to compute the number of different names by gender over years. In the 1880' parents had roughly 1000 names per gender to name their babies. Nowadays, parents have 14 to 19 thousands!
# number of different names by gender over years
diversity = pd.crosstab(us['year'], us['gender'])
diversity.head()
# nowadays
diversity.tail()
We can put every thing together in a single function.
# plot evolution of number of names over years
def plot_diversity(df):
diversity = pd.crosstab(df['year'], df['gender'])
diversity.plot(title="Evolution of names diversity by gender over years", color=['r', 'b']);
# US diversity
plot_diversity(us)
# French diversity
plot_diversity(fr)
We will now add a column with the last letter of each name and compute the evolution of the most frequent last letters which appears nowadays. This is computed by using the apply() method and a lambda function which extracts the last letter and put it in uppercase for readability.
# add a column with the last later of names
us["last"] = us["name"].apply(lambda x: x[-1].upper())
us
Then we use the crosstab() function in order to get the number of births by last letters over years.
# diversity of names by last letter over years
last = pd.crosstab(us['year'], us['last'])
last
Then we sort the letters of the last available year. We use the .iloc[] operator which enables to select rows, columns and parts of a DataFrame by indices.
# sort most used last letters for US names in 2015
last.iloc[-1].sort_values(ascending=False)
Here we put the 7 top letters in an array.
# put the 7 letters in an array
letters = last.iloc[-1].sort_values(ascending=False).index[:7]
letters
We can perform a fancy indexing on the DataFrame with the 7 letters and perform a plot.
# last letters DataFrame limited to 7 top last letters of 2015
last[letters]
We can put every thing together in a single function and run the French data.
# plot the evolution of diversity of names according to their last letter
def plot_last_letters(df):
df["last"] = df["name"].apply(lambda x: x[-1].upper())
last = pd.crosstab(df['year'], df['last'])
letters = last.iloc[-1].sort_values(ascending=False).head(7).index
last[letters].plot(title="Evolution of diversity of names according to their last letter");
# plot the evolution of diversity of US names according to their last letter
plot_last_letters(us)
# plot the evolution of diversity of US names according to their last letter
plot_last_letters(fr)
# interactive scatter plot of last letters for US and French names across years
from ipywidgets import interact, IntSlider
# prepare and normalize data
last_us = pd.crosstab(us['year'], us['last'])
last_us = last_us.div(last_us.sum(axis=1), axis=0)
last_fr = pd.crosstab(fr['year'], fr['last'])
last_fr = last_fr.div(last_fr.sum(axis=1), axis=0)
def plot_letters(year):
data = pd.concat([last_us.loc[year], last_fr.loc[year]], axis=1, join='inner')
plt.scatter(data.iloc[:,0], data.iloc[:,1])
for (x, y, letter) in zip(data.iloc[:,0], data.iloc[:,1], last.columns):
plt.text(x + 0.003, y + 0.003, letter)
interact(plot_letters, year=IntSlider(min=1900, max=2015, step=1, value=1900));
# find top names
top_names = us.groupby('name').sum()['births']
top_names = top_names.sort_values(ascending=False)
top_names = top_names[top_names >= 10000]
top_names
Select those names in the US DataFrame using the isin() method.
# data for top names
selection = us[us['name'].isin(top_names.index)]
selection
Secondly, we pivot the data so as to have both genders on each line sorted by names and the values are the sum of births per name and gender. We add a column with the ratio F vs M.
# pivot by name and gender with the sum of births
ratio_name = selection.pivot_table(index='name',
columns='gender',
values='births',
aggfunc=np.sum)
ratio_name["ratio"] = ratio_name["F"] / (ratio_name["F"] + ratio_name["M"])
ratio_name
We can now plot the distribution of the ratio among names by using the plot() function with the option kind='hist'. We notice that most of names are mostly either female or male, but some names are possibily ambivalent.
# distribution of F vs M ratio for top 1000 US names
ratio_name['ratio'].plot(kind='hist');
We put everything in a function so that we can process French data.
# plot distribution of F vs M ratio for top 1000 names
def plot_distribution(df):
top_names = df.groupby('name').sum()['births']
top_names = top_names.sort_values(ascending=False)
top_names = top_names[top_names >= 10000]
selection = df[df['name'].isin(top_names.index)]
ratio_name = selection.pivot_table(index=['name'],
columns='gender',
values='births',
aggfunc=np.sum)
ratio_name["ratio"] = ratio_name["F"] / (ratio_name["F"] + ratio_name["M"])
ratio_name['ratio'].plot(kind='hist');
# distribution of F vs M ratio for top 1000 French names
plot_distribution(fr)
First we select the data related to a single name, for instance Leslie.
# select US data for Leslie
selection = us[us["name"] == "Leslie"]
selection
Then we compute the evolution of births for this name by gender over years. Note that we could have reuse the selection performed above in ratio_gender.
# evolution of births by gender
evolution_gender = selection.pivot_table(index='year',
columns='gender',
values='births')
evolution_gender
Then we compute the F and M ratio by dividing the births by the sum of each line (axis=1) along the index (axis=0).
# gender evolution over years
evolution_gender = evolution_gender.div(evolution_gender.sum(axis=1), axis=0)
evolution_gender
Put everything in a function and run over US and French data.
# plot gender ratio evolution of years
def plot_transname(df, name):
selection = df[df["name"] == name]
gender = selection.pivot_table(index='year',
columns='gender',
values='births')
gender = gender.div(gender.sum(axis=1), axis=0)
gender.plot(title="Evolution proportion of {} by gender over years".format(name), style={'M': 'b', 'F': 'r'});
# gender evolution of US Donnie
plot_transname(us, "Leslie")
# gender evolution of US Donnie
plot_transname(us, "Donnie")