import pandas as pd
from siuba import *
from siuba.dply.vector import row_number, n
from plotnine import *

Golden Age of Television Analysis
tv_ratings = pd.read_csv(
"https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-08/IMDb_Economist_tv_ratings.csv",
parse_dates = ["date"]
)

Glance at data for a single show
tv_ratings >> filter(_, _.title.str.contains("Buffy"))

| | titleId | seasonNumber | title | date | av_rating | share | genres |
|---|---|---|---|---|---|---|---|
| 275 | tt0118276 | 1 | Buffy the Vampire Slayer | 1997-04-14 | 7.9629 | 11.70 | Action,Drama,Fantasy |
| 276 | tt0118276 | 2 | Buffy the Vampire Slayer | 1997-12-31 | 8.4191 | 19.41 | Action,Drama,Fantasy |
| 277 | tt0118276 | 3 | Buffy the Vampire Slayer | 1999-01-29 | 8.6233 | 17.12 | Action,Drama,Fantasy |
| 278 | tt0118276 | 4 | Buffy the Vampire Slayer | 2000-01-19 | 8.2205 | 16.19 | Action,Drama,Fantasy |
| 279 | tt0118276 | 5 | Buffy the Vampire Slayer | 2001-01-12 | 8.3028 | 11.99 | Action,Drama,Fantasy |
| 280 | tt0118276 | 6 | Buffy the Vampire Slayer | 2002-01-29 | 8.1008 | 8.45 | Action,Drama,Fantasy |
| 281 | tt0118276 | 7 | Buffy the Vampire Slayer | 2003-01-18 | 8.0460 | 9.89 | Action,Drama,Fantasy |
Count season number
# Tally the rows for each season number (siuba's count puts the tally in
# column "n") and draw the tallies as columns.
(
    ggplot(
        tv_ratings >> count(_, _.seasonNumber),
        aes(x="seasonNumber", y="n"),
    )
    + geom_col()
    + labs(title="Season Number Frequency", x="season number", y="count")
)
<ggplot: (8742457309675)>
Average rating throughout season
# Keep season numbers up to and including 7, average av_rating within each
# season number, and draw the averages as a line.
(
    ggplot(
        tv_ratings
        >> filter(_, _.seasonNumber <= 7)
        >> group_by(_, _.seasonNumber)
        >> summarize(_, av_rating=_.av_rating.mean()),
        aes(x="seasonNumber", y="av_rating"),
    )
    + geom_line()
    + labs(
        title="Average rating across seasons",
        x="season number",
        y="average rating",
    )
)
<ggplot: (8742457244853)>
Shows with most variable ratings
Filter down
# One summary row per title: mean and standard deviation of av_rating plus
# the number of rows in the group, ordered from highest to lowest mean.
# NOTE(review): grouping is by title text, not titleId; distinct shows that
# share a title would be pooled -- confirm that is intended.
by_show = (
    tv_ratings
    >> group_by(_, "title")
    >> summarize(
        _,
        avg_rating=_.av_rating.mean(),
        sd=_.av_rating.std(),
        seasons=n(_),
    )
    >> arrange(_, -_.avg_rating)
)
# Among titles with at least 5 rows, take the 6 with the largest standard
# deviation of av_rating.
most_variable_shows = (
    by_show
    >> filter(_, _.seasons >= 5)
    >> arrange(_, -_.sd)  # negate to sort descending
    >> head(_, 6)
)
most_variable_shows

| | title | avg_rating | sd | seasons |
|---|---|---|---|---|
| 49 | Are You Afraid of the Dark? | 8.422971 | 1.390834 | 7 |
| 263 | Friday Night Lights | 8.085020 | 0.749403 | 5 |
| 650 | The 100 | 8.314140 | 0.708071 | 5 |
| 582 | Scrubs | 8.236744 | 0.702544 | 9 |
| 195 | Dexter | 8.582400 | 0.694169 | 8 |
| 562 | Roseanne | 7.332537 | 0.670299 | 8 |
Plot show ratings
# Keep only the ratings rows whose title appears in most_variable_shows
# (inner join on "title"), then draw av_rating by season number with one
# facet per title.
(
    ggplot(
        tv_ratings >> inner_join(_, most_variable_shows, "title"),
        aes(x="seasonNumber", y="av_rating", color="title"),
    )
    + geom_line()
    + geom_point()
    # Integer axis breaks at 0 through 10.
    + scale_x_continuous(breaks=range(11))
    + facet_wrap("~ title")
    # The facet strips already name each title, so hide the color legend.
    + theme(legend_position="none")
    + labs(x="season number", y="average rating")
)
<ggplot: (8742455023836)>