French Train Delays

This document is the result of a screencast where I translated an analysis from R into python.

The R analysis was done as part of an hour long screencast by Dave Robinson. I've reproduced his code in this document (using %%R cell magic), and link to the original code below.

| @machow translation screencast | @dgrtwo screencast | original analysis | Tidy Tuesday |

In [1]:
import rpy2
from qgrid import show_grid
import pandas as pd
from siuba import _, mutate, arrange, select, filter, count, group_by, summarize, ungroup

from plotnine import *
In [2]:
import rpy2
from qgrid import show_grid
%load_ext rpy2.ipython
/Users/machow/.virtualenvs/tidytuesday-py/lib/python3.6/site-packages/rpy2/robjects/pandas2ri.py:14: FutureWarning: pandas.core.index is deprecated and will be removed in a future version.  The public classes are available in the top-level namespace.
  from pandas.core.index import Index as PandasIndex

Read in data

In [3]:
full_trains = (
    pd.read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-02-26/full_trains.csv")
    >> mutate(
        pct_late_at_departure = _.num_late_at_departure / _.total_num_trips,
        arrival_station = _.arrival_station.str.title(),
        departure_station = _.departure_station.str.title(),
        date = lambda _: pd.to_datetime(_.year.astype(str) + _.month.apply("-{:02d}".format) + "-01")
    )
    >> arrange(_.departure_station, _.arrival_station, _.month)
    >> mutate(service = _.service.ffill())
)
In [4]:
%%R

library(tidyverse)
library(scales)
theme_set(theme_light())

full_trains <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-02-26/full_trains.csv") %>%
  mutate(pct_late_at_departure = num_late_at_departure / total_num_trips,
         arrival_station = str_to_title(arrival_station),
         departure_station = str_to_title(departure_station),
         date = as.Date(sprintf("%d-%02d-01", year, month))) %>%
  arrange(departure_station, arrival_station, month) %>%
  fill(service)
R[write to console]: ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

R[write to console]:  ggplot2 3.3.0      purrr   0.3.3
 tibble  2.1.3      dplyr   0.8.5
 tidyr   1.0.2      stringr 1.4.0
 readr   1.3.1      forcats 0.5.0

R[write to console]: ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
 dplyr::filter() masks stats::filter()
 dplyr::lag()    masks stats::lag()

R[write to console]: 
Attaching package: ‘scales’


R[write to console]: The following object is masked from ‘package:purrr’:

    discard


R[write to console]: The following object is masked from ‘package:readr’:

    col_factor


R[write to console]: Parsed with column specification:
cols(
  .default = col_double(),
  service = col_character(),
  departure_station = col_character(),
  arrival_station = col_character(),
  comment_cancellations = col_logical(),
  comment_delays_at_departure = col_logical(),
  comment_delays_on_arrival = col_character()
)

R[write to console]: See spec(...) for full column specifications.

⚠️: difference between pd.Series.where and np.where

  • Series.where(cond, else_value) - keeps values where cond is true, otherwise uses else_value (default NA)
  • np.where(cond) - gives position numbers where cond is True
  • np.where(cond, if_value, else_value) - sets if_value where cond is True, otherwise else_value.
In [5]:
import numpy as np

full_trains.index.where(full_trains.index == 3874)

np.where(full_trains.index == 3874)
Out[5]:
(array([3]),)
In [6]:
#show_grid(full_trains, grid_options = {'forceFitColumns': False})

Histogram of percentage late for November, 2018

In [7]:
percent_format = lambda l: ["{:.0f}%".format(v * 100) for v in l]

november_2018 = (
    full_trains
    >> filter(_.year == 2018, _.month == 11)
)

(november_2018
  >> ggplot(aes("pct_late_at_departure")) 
   + geom_histogram(binwidth = .05)
   + scale_x_continuous(labels=lambda l: ["{:.0f}%".format(v * 100) for v in l])
)
Out[7]:
<ggplot: (-9223372036545809790)>
In [8]:
%%R

november_2018 <- full_trains %>%
  filter(year == 2018, month == 11)

november_2018 %>%
  ggplot(aes(pct_late_at_departure)) +
  geom_histogram(binwidth = .05) +
  scale_x_continuous(labels = percent_format())

Boxplot of percent late at departure by departure station

✏️: what is a Categorical series?

These are often used by plotnine and ggplot to reorder legends in plots!

Here is an example.

In [9]:
# what is a categorical
cat = pd.Series(['a', 'a', 'b', 'b', 'c'], dtype = "category")

print(cat.cat.categories)     # unique values (also called levels)
print(cat.cat.codes)          # mapping onto categories
Index(['a', 'b', 'c'], dtype='object')
0    0
1    0
2    1
3    1
4    2
dtype: int8
In [10]:
from siuba.dply.forcats import fct_lump

(
  november_2018
  >> mutate(departure_station = fct_lump(_.departure_station, 3))
  >> ggplot(aes("departure_station", "pct_late_at_departure"))
  + geom_boxplot()
  + scale_y_continuous(labels = percent_format)
)
Out[10]:
<ggplot: (-9223372036539746479)>
In [11]:
%%R

november_2018 %>%
  mutate(departure_station = fct_lump(departure_station, 3)) %>%
  ggplot(aes(departure_station, pct_late_at_departure)) +
  geom_boxplot() +
  scale_y_continuous(labels = percent_format())

Heat map of late trains based on departing and arriving stations

In [12]:
from siuba.dply.forcats import fct_reorder

(
    november_2018
    >> mutate(
        arrival_station=fct_reorder(
            fct_lump(_.arrival_station, n=14), _.pct_late_at_departure
        )
    )
    >> mutate(
        departure_station=fct_reorder(
            fct_lump(_.departure_station, n=14), _.pct_late_at_departure
        )
    )
    >> group_by(_.arrival_station, _.departure_station)
    >> summarize(
        pct_late_at_departure=_.num_late_at_departure.sum(skipna=True)
        / _.total_num_trips.sum(skipna=True)
    )
    >> ggplot(aes("arrival_station", "departure_station", fill="pct_late_at_departure"))
    + geom_tile()
    + scale_fill_gradient2(low="blue", high="red", midpoint=0.25, labels=percent_format)
    + theme(axis_text_x=element_text(angle=90, hjust=1))
    + labs(
        x="Arrival station",
        y="Departure station",
        fill="% late at departure",
        title="Which routes have the most delayed trains in November 2018?",
        subtitle="Stations with only one arriving/departing route were lumped into 'Other'",
    )
)
Out[12]:
<ggplot: (308973516)>
In [13]:
%%R

november_2018 %>%
#  mutate(arrival_station = fct_infreq(fct_lump(arrival_station, prop = .01))) %>%
#  mutate(departure_station = fct_infreq(fct_lump(departure_station, prop = .01))) %>%
  mutate(arrival_station = fct_reorder(fct_lump(arrival_station, prop = .01), pct_late_at_departure)) %>%
  mutate(departure_station = fct_reorder(fct_lump(departure_station, prop = .01), pct_late_at_departure)) %>%
  group_by(arrival_station, departure_station) %>%
  summarize(pct_late_at_departure = sum(num_late_at_departure) / sum(total_num_trips)) %>%
  ggplot(aes(arrival_station, departure_station, fill = pct_late_at_departure)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red", midpoint = .25, labels = percent_format()) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "Arrival station",
       y = "Departure station",
       fill = "% late at departure",
       title = "Which routes have the most delayed trains in November 2018?",
       subtitle = "Stations with only one arriving/departing route were lumped into 'Other'")

Percentage late over time

In [14]:
(
    full_trains
    >> filter(_.departure_station == "Lyon Part Dieu")
    >> ggplot(aes("date", "pct_late_at_departure", color="arrival_station"))
    + geom_line()
    + scale_y_continuous(labels=percent_format)
    + expand_limits(y=0)
)
Out[14]:
<ggplot: (312797879)>
In [15]:
%%R
full_trains %>%
  filter(departure_station == "Lyon Part Dieu") %>%
  ggplot(aes(date, pct_late_at_departure, color = arrival_station)) +
  geom_line() +
  scale_y_continuous(labels = percent_format()) +
  expand_limits(y = 0)

Summarizing departure station by month

In [16]:
num_cols = full_trains.columns[full_trains.columns.str.contains("num")]

summarize_op = {colname: _[colname].sum() for colname in num_cols}
In [17]:
from siuba.experimental.pd_groups import fast_summarize

# create our own last function
last = lambda ser: ser.iloc[-1]

by_departure_station_month = (
    full_trains
    >> group_by(departure_station=fct_lump(_.departure_station, n=13), date=_.date)
    >> summarize(**summarize_op)
    >> ungroup()
    >> mutate(pct_late_at_departure=_.num_late_at_departure / _.total_num_trips)
)

(
    by_departure_station_month
    >> mutate(
        departure_station=fct_reorder(
            _.departure_station, -_.pct_late_at_departure, last
        )
    )
    >> ggplot(aes("date", "pct_late_at_departure", color="departure_station"))
    + geom_line()
    + scale_y_continuous(labels=percent_format)
    + labs(x="Month", y="% late at departure", color="Departure station")
)
Out[17]:
<ggplot: (315681782)>
In [18]:
%%R
by_departure_station_month <- full_trains %>%
  group_by(departure_station = fct_lump(departure_station, prop = .01),
           date) %>%
  summarize_at(vars(contains("num")), sum) %>%
  ungroup() %>%
  mutate(pct_late_at_departure = num_late_at_departure / total_num_trips)

by_departure_station_month %>%
  mutate(departure_station = fct_reorder(departure_station, -pct_late_at_departure, last)) %>%
  ggplot(aes(date, pct_late_at_departure, color = departure_station)) +
  geom_line() +
  scale_y_continuous(labels = percent_format()) +
  labs(x = "Month",
       y = "% late at departure",
       color = "Departure station")

Similar heat map over time

✏️: To convert a number to month name, use the Series.dt.month_name() method

In [19]:
from siuba import if_else

summarize_op = {colname: _[colname].sum() for colname in num_cols}


by_departure_station_month = (
    full_trains
    >> group_by(
        departure_station=if_else(
            _.service == "International",
            _.departure_station + " (International)",
            _.departure_station,
        ),
        service=_.service,
        year=_.year,
        month=fct_reorder(_.date.dt.month_name(), _.month),
    )
    >> summarize(**summarize_op)
    >> ungroup()
    >> mutate(pct_late_at_departure=_.num_late_at_departure / _.total_num_trips)
)

(
    by_departure_station_month
    >> mutate(
        departure_station=fct_reorder(
            _.departure_station,
            (_.service != "International") + _.pct_late_at_departure,
            np.mean,
        )
    )
    >> ggplot(aes("month", "departure_station", fill="pct_late_at_departure"))
    + geom_tile()
    + scale_fill_gradient2(low="blue", high="red", midpoint=0.25, labels=percent_format)
    + facet_wrap("~ year", nrow=1, scales="free_x")
    + theme(
        axis_text_x=element_text(angle=90, hjust=1),
        axis_ticks=element_blank(),
        panel_grid=element_blank(),
    )
    + labs(fill="% late at departure")
    + labs(
        x="Month",
        y="Departure station",
        title="Which stations had delays in which months?",
        subtitle="Ordered by the average delay, with international routes on the bottom",
    )
)
Out[19]:
<ggplot: (-9223372036539048988)>
In [20]:
%%R

by_departure_station_month <- full_trains %>%
  group_by(departure_station = ifelse(service == "International",
                                      paste0(departure_station, " (International)"),
                                      departure_station),
           service,
           year,
           month = fct_reorder(month.name[month], month)) %>%
  summarize_at(vars(contains("num")), sum) %>%
  ungroup() %>%
  mutate(pct_late_at_departure = num_late_at_departure / total_num_trips)

by_departure_station_month %>%
  mutate(departure_station = fct_reorder(departure_station, (service != "International") + pct_late_at_departure, mean)) %>%
  ggplot(aes(month, departure_station, fill = pct_late_at_departure)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red", midpoint = .25, labels = percent_format()) +
  facet_wrap(~ year, nrow = 1, scales = "free_x") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        axis.ticks = element_blank(),
        panel.grid = element_blank()) +
  labs(fill = "% late at departure") +
  labs(x = "Month",
       y = "Departure station",
       title = "Which stations had delays in which months?",
       subtitle = "Ordered by the average delay, with international routes on the bottom")

Include code for tabs

In [21]:
def javascript(*st,file=None):
    from IPython.display import display, HTML
    if len(st) == 1 and file is None:
        s = st[0]
    elif len(st) == 0 and file is not None:
        s = open(file).read()
    else:
        raise ValueError('Pass either a string or file=.')
    display(HTML("<script type='text/javascript'>" + s + "</script>"))
    
javascript(file = "templates/puretabs.js")
In [22]:
%%html
<script>
  window.onload = function() {
    //pureTabs.init();
    //pureTabs.init('tabs', 'tabs--active');
    pureTabs.init('pytabs-1', 'tabs__link--active');
    pureTabs.init('pytabs-2', 'tabs__link--active');      
    pureTabs.init('pytabs-2', 'tabs__link--active');      
    pureTabs.init('pytabs-3', 'tabs__link--active');      
    pureTabs.init('pytabs-4', 'tabs__link--active');      
    pureTabs.init('pytabs-5', 'tabs__link--active');      
    pureTabs.init('pytabs-6', 'tabs__link--active');      
    pureTabs.init('pytabs-7', 'tabs__link--active');      
    pureTabs.init('pytabs-8', 'tabs__link--active');      
    pureTabs.init('pytabs-9', 'tabs__link--active');      

  }
</script>