imdb_data
imdb_data.Rmd
Load Packages and Data
library(tmdbdata)
library(dplyr)
library(tidyr)
library(forcats)
library(ggplot2)
library(ggridges)
library(scales)
# Load the IMDB tables (load_imdb_data() is presumably exported by tmdbdata --
# confirm). The log below shows this run downloading the title.basics,
# title.ratings and title.episode dumps and saving .fst copies under
# ./cache_imdb/.
imdb_data <- load_imdb_data()
#> creating ./cache_imdb/ directory
#> ℹ Download https://datasets.imdbws.com/title.basics.tsv.gz
#> ✔ Download https://datasets.imdbws.com/title.basics.tsv.gz - 196.5 MB [1s]
#>
#> ℹ Unzip .gz file
#> ✔ Unzip .gz file [3.5s]
#>
#> ℹ Read .tsv file
#> ℹ Read .tsv file - 11,662,763 rows x 9 columns
#> ✔ Read .tsv file - 11,662,763 rows x 9 columns [23.7s]
#>
#> ℹ Save ./cache_imdb/title_basics.fst
#> ℹ Save ./cache_imdb/title_basics.fst - 772.4 MB.
#> ✔ Save ./cache_imdb/title_basics.fst - 772.4 MB. [2.7s]
#>
#> ℹ Download https://datasets.imdbws.com/title.ratings.tsv.gz
#> ✔ Download https://datasets.imdbws.com/title.ratings.tsv.gz - 7.5 MB [145ms]
#>
#> ℹ Unzip .gz file
#> ✔ Unzip .gz file [157ms]
#>
#> ℹ Read .tsv file
#> ✔ Read .tsv file - 1,568,304 rows x 3 columns [386ms]
#>
#> ℹ Save ./cache_imdb/title_ratings.fst
#> ✔ Save ./cache_imdb/title_ratings.fst - 22.3 MB. [96ms]
#>
#> ℹ Download https://datasets.imdbws.com/title.episode.tsv.gz
#> ℹ Download https://datasets.imdbws.com/title.episode.tsv.gz - 47.2 MB
#> ✔ Download https://datasets.imdbws.com/title.episode.tsv.gz - 47.2 MB [218ms]
#>
#> ℹ Unzip .gz file
#> ✔ Unzip .gz file [830ms]
#>
#> ℹ Read .tsv file
#> ℹ Read .tsv file - 8,974,569 rows x 4 columns
#> ✔ Read .tsv file - 8,974,569 rows x 4 columns [3.2s]
#>
#> ℹ Save ./cache_imdb/title_episode.fst
#> ✔ Save ./cache_imdb/title_episode.fst - 204.4 MB. [876ms]
#>
#> ℹ Combining data from IMDB tables...
#> • TV Episodes: 797,576 rows x 12 columns
#> • Rated Media: 1,549,078 rows x 9 columns
#> ℹ Save ./cache_imdb/imdb_tv_episodes.fst
#> ✔ Save ./cache_imdb/imdb_tv_episodes.fst - 62.4 MB. [227ms]
#>
#> ℹ Save ./cache_imdb/imdb_rated_media.fst
#> ✔ Save ./cache_imdb/imdb_rated_media.fst - 79.7 MB. [355ms]
#>
# Pull the two tables out of the loaded list; runtimes are coerced to double.
rated_media <- mutate(
  imdb_data$rated_media,
  runtimeMinutes = as.double(runtimeMinutes)
)
tv_episodes <- imdb_data$tv_episodes
Determine data structure
glimpse(rated_media)
#> Rows: 1,549,078
#> Columns: 9
#> $ titleType <chr> "movie", "movie", "movie", "movie", "tvSeries", "movie"…
#> $ title <chr> "The Shawshank Redemption", "The Dark Knight", "Incepti…
#> $ averageRating <dbl> 9.3, 9.0, 8.8, 8.8, 9.2, 8.8, 8.7, 8.9, 9.5, 8.7, 9.2, …
#> $ numVotes <int> 3043798, 3020785, 2683341, 2465082, 2433513, 2379234, 2…
#> $ genres <chr> "Drama", "Action,Crime,Drama", "Action,Adventure,Sci-Fi…
#> $ runtimeMinutes <dbl> 142, 152, 148, 139, 60, 142, 169, 154, 45, 136, 175, 17…
#> $ startYear <int> 1994, 2008, 2010, 1999, 2011, 1994, 2014, 1994, 2008, 1…
#> $ endYear <int> NA, NA, NA, NA, 2019, NA, NA, NA, 2013, NA, NA, NA, NA,…
#> $ tconst <chr> "tt0111161", "tt0468569", "tt1375666", "tt0137523", "tt…
Number of observations
# Tally titles per titleType, largest group first, then format the tallies
# with thousands separators (this turns `count` into a character column).
count_by_type <- rated_media %>%
  group_by(titleType) %>%
  tally(name = "count", sort = TRUE) %>%
  mutate(count = prettyNum(count, big.mark = ","))
count_by_type
#> # A tibble: 10 × 2
#> titleType count
#> <chr> <chr>
#> 1 tvEpisode 797,576
#> 2 movie 322,522
#> 3 short 158,491
#> 4 tvSeries 104,019
#> 5 video 56,613
#> 6 tvMovie 55,407
#> 7 tvMiniSeries 20,500
#> 8 videoGame 18,203
#> 9 tvSpecial 13,307
#> 10 tvShort 2,440
# Make titleType a factor whose levels follow the row order of count_by_type
# (most frequent type first), so later plots use that ordering.
rated_media$titleType <- factor(
  rated_media$titleType,
  levels = count_by_type[["titleType"]]
)
Rating vs Votes
# 2-D binned counts of average rating against number of votes (log10 x axis,
# log10 fill scale), one panel per title type.
ggplot(rated_media, aes(x = numVotes, y = averageRating)) +
  geom_bin_2d() +
  scale_x_log10(labels = label_comma()) +
  scale_y_continuous(breaks = seq(0, 10, by = 2)) +
  scale_fill_viridis_c(
    option = "mako",
    labels = label_comma(),
    transform = transform_log10()
  ) +
  labs(title = "Rating vs #Votes") +
  facet_wrap(vars(titleType), ncol = 3)
Rating vs Runtime
# 2-D binned counts of average rating against runtime, one panel per title
# type. Only titles shorter than three hours are kept (the filter also drops
# rows whose runtime is missing).
ggplot(
  filter(rated_media, runtimeMinutes < 3 * 60),
  aes(x = runtimeMinutes, y = averageRating)
) +
  geom_bin_2d() +
  scale_x_continuous(
    name = "Runtime (Hours)",
    breaks = seq(0, 300, by = 60),
    labels = paste0(0:5, "h")
  ) +
  scale_y_continuous(breaks = seq(0, 10, by = 2)) +
  scale_fill_viridis_c(
    option = "rocket",
    labels = label_comma(),
    transform = transform_log10()
  ) +
  labs(title = "Rating vs Runtime") +
  facet_wrap(vars(titleType), ncol = 3)
Distributions of parameters
We can see vertical ‘stripes’ of data in the above plot, suggesting that the underlying distributions may be of interest.
Runtime
# Ridgeline densities of runtime per title type, for titles shorter than three
# hours. fct_rev() reverses the titleType level order before plotting.
rated_media %>%
  mutate(titleType = fct_rev(as_factor(titleType))) %>%
  filter(runtimeMinutes < 3 * 60) %>%
  ggplot(aes(x = runtimeMinutes, y = titleType, fill = titleType)) +
  # `scale` is a constant ridge-height setting, so it is passed as a parameter
  # instead of being mapped inside aes().
  geom_density_ridges(scale = 1.5, show.legend = FALSE) +
  scale_x_continuous(
    breaks = seq(0, 300, 60),
    labels = paste0(0:5, "h"),
    name = "Runtime (Hours)"
  ) +
  labs(title = "Runtimes") +
  theme_minimal()
- Many tvEpisodes are either 30mins, 45mins, or 1hr
  + Similar trend in tvSeries
- Most movies are ~1:30 ± 30mins
- shorts and tvShorts don’t normally exceed 30 minutes
Release Year
# Ridgeline densities of release year per title type, for titles whose
# startYear is after 1950.
rated_media %>%
  mutate(titleType = as_factor(titleType)) %>%
  filter(startYear > 1950) %>%
  ggplot(aes(x = startYear, y = titleType, fill = titleType)) +
  # `scale` is a constant ridge-height setting, so it is passed as a parameter
  # instead of being mapped inside aes().
  geom_density_ridges(scale = 1.3, show.legend = FALSE) +
  scale_x_continuous(
    breaks = seq(1950, 2020, 10),
    name = "Release Year"
  ) +
  labs(title = "Release Year") +
  theme_minimal()
TV Episodes
Top series (by #votes)
# The 80 most-voted titles whose type contains "Series", shown with tconst as
# the first column and titleType as the last.
rated_media %>%
  filter(grepl("Series", titleType)) %>%
  # Sort explicitly so "top by #votes" does not depend on the incoming row
  # order of rated_media.
  arrange(desc(numVotes)) %>%
  select(tconst, everything(), -titleType, titleType) %>%
  head(80)
#> tconst title averageRating numVotes
#> <char> <char> <num> <int>
#> 1: tt0944947 Game of Thrones 9.2 2433513
#> 2: tt0903747 Breaking Bad 9.5 2330915
#> 3: tt4574334 Stranger Things 8.6 1445426
#> 4: tt0108778 Friends 8.9 1144283
#> 5: tt1520211 The Walking Dead 8.1 1141861
#> 6: tt1475582 Sherlock 9.0 1048301
#> 7: tt7366338 Chernobyl 9.3 949728
#> 8: tt0898266 The Big Bang Theory 8.1 907924
#> 9: tt0773262 Dexter 8.6 843776
#> 10: tt1190634 The Boys 8.6 777289
#> 11: tt0386676 The Office 9.0 775224
#> 12: tt0460649 How I Met Your Mother 8.3 759244
#> 13: tt3032476 Better Call Saul 9.0 731601
#> 14: tt2442560 Peaky Blinders 8.7 711570
#> 15: tt2356777 True Detective 8.9 702963
#> 16: tt2085059 Black Mirror 8.7 693305
#> 17: tt10919420 Squid Game 8.0 663222
#> 18: tt3581920 The Last of Us 8.6 655142
#> 19: tt2861424 Rick and Morty 9.1 649925
#> 20: tt0411008 Lost 8.3 641206
#> 21: tt0455275 Prison Break 8.3 627652
#> 22: tt8111088 The Mandalorian 8.6 625011
#> 23: tt10048342 The Queen's Gambit 8.5 617322
#> 24: tt2306299 Vikings 8.5 617302
#> 25: tt2560140 Attack on Titan 9.1 610229
#> 26: tt5180504 The Witcher 7.9 595564
#> 27: tt6468322 Money Heist 8.2 571214
#> 28: tt0185906 Band of Brothers 9.4 561911
#> 29: tt0412142 House 8.7 558416
#> 30: tt0475784 Westworld 8.4 548013
#> 31: tt1856010 House of Cards 8.6 546925
#> 32: tt0141842 The Sopranos 9.2 524569
#> 33: tt1442437 Modern Family 8.5 515904
#> 34: tt1632701 Suits 8.4 515130
#> 35: tt0460681 Supernatural 8.4 511869
#> 36: tt3322312 Daredevil 8.6 505631
#> 37: tt2707408 Narcos 8.7 501151
#> 38: tt5753856 Dark 8.7 491627
#> 39: tt11198330 House of the Dragon 8.3 476157
#> 40: tt2193021 Arrow 7.5 454174
#> 41: tt0096697 The Simpsons 8.6 453501
#> 42: tt2802850 Fargo 8.8 444783
#> 43: tt9140554 Loki 8.2 444297
#> 44: tt4158110 Mr. Robot 8.5 441171
#> 45: tt0877057 Death Note 8.9 425280
#> 46: tt7631058 The Lord of the Rings: The Rings of Power 6.9 425217
#> 47: tt0121955 South Park 8.7 422709
#> 48: tt13443470 Wednesday 8.0 416486
#> 49: tt0417299 Avatar: The Last Airbender 9.3 406149
#> 50: tt0306414 The Wire 9.3 403184
#> 51: tt10986410 Ted Lasso 8.8 400570
#> 52: tt9140560 WandaVision 7.9 396101
#> 53: tt11126994 Arcane 9.0 395203
#> 54: tt2467372 Brooklyn Nine-Nine 8.4 394037
#> 55: tt7767422 Sex Education 8.2 378893
#> 56: tt0182576 Family Guy 8.1 377439
#> 57: tt3107288 The Flash 7.5 376981
#> 58: tt4052886 Lucifer 8.0 375645
#> 59: tt5071412 Ozark 8.5 370020
#> 60: tt1796960 Homeland 8.3 369549
#> 61: tt0098904 Seinfeld 8.9 368829
#> 62: tt5290382 Mindhunter 8.6 367537
#> 63: tt1405406 The Vampire Diaries 7.7 366922
#> 64: tt0413573 Grey's Anatomy 7.6 359585
#> 65: tt1844624 American Horror Story 7.9 354626
#> 66: tt1124373 Sons of Anarchy 8.5 340210
#> 67: tt7335184 You 7.7 336329
#> 68: tt0367279 Arrested Development 8.6 334740
#> 69: tt2372162 Orange Is the New Black 8.0 328947
#> 70: tt1837492 13 Reasons Why 7.4 328734
#> 71: tt7660850 Succession 8.8 317172
#> 72: tt11280740 Severance 8.7 316482
#> 73: tt14392248 Aspirants 9.2 315616
#> 74: tt12637874 Fallout 8.3 313914
#> 75: tt1586680 Shameless 8.5 310910
#> 76: tt6763664 The Haunting of Hill House 8.5 309942
#> 77: tt1312171 The Umbrella Academy 7.8 303758
#> 78: tt1439629 Community 8.5 303410
#> 79: tt1266020 Parks and Recreation 8.6 302909
#> 80: tt0388629 One Piece 9.0 297137
#> tconst title averageRating numVotes
#> genres runtimeMinutes startYear endYear titleType
#> <char> <num> <int> <int> <fctr>
#> 1: Action,Adventure,Drama 60 2011 2019 tvSeries
#> 2: Crime,Drama,Thriller 45 2008 2013 tvSeries
#> 3: Drama,Fantasy,Horror 60 2016 2025 tvSeries
#> 4: Comedy,Romance 22 1994 2004 tvSeries
#> 5: Drama,Horror,Thriller 45 2010 2022 tvSeries
#> 6: Crime,Drama,Mystery 90 2010 2017 tvSeries
#> 7: Drama,History,Thriller 60 2019 2019 tvMiniSeries
#> 8: Comedy,Romance 22 2007 2019 tvSeries
#> 9: Crime,Drama,Mystery 60 2006 2013 tvSeries
#> 10: Action,Comedy,Crime 60 2019 NA tvSeries
#> 11: Comedy 22 2005 2013 tvSeries
#> 12: Comedy,Drama,Romance 23 2005 2014 tvSeries
#> 13: Crime,Drama 45 2015 2022 tvSeries
#> 14: Crime,Drama 60 2013 2022 tvSeries
#> 15: Crime,Drama,Mystery 60 2014 NA tvSeries
#> 16: Crime,Drama,Mystery 60 2011 NA tvSeries
#> 17: Action,Drama,Mystery 60 2021 2025 tvSeries
#> 18: Action,Adventure,Drama 50 2023 NA tvSeries
#> 19: Adventure,Animation,Comedy 23 2013 NA tvSeries
#> 20: Adventure,Drama,Fantasy 45 2004 2010 tvSeries
#> 21: Action,Crime,Drama 44 2005 2017 tvSeries
#> 22: Action,Adventure,Fantasy 40 2019 NA tvSeries
#> 23: Drama 60 2020 2020 tvMiniSeries
#> 24: Action,Adventure,Drama 45 2013 2020 tvSeries
#> 25: Action,Adventure,Animation 24 2013 2023 tvSeries
#> 26: Action,Adventure,Drama 60 2019 NA tvSeries
#> 27: Action,Crime,Drama 60 2017 2021 tvSeries
#> 28: Action,Drama,History 60 2001 2001 tvMiniSeries
#> 29: Drama,Mystery 45 2004 2012 tvSeries
#> 30: Drama,Mystery,Sci-Fi 60 2016 2022 tvSeries
#> 31: Drama 50 2013 2018 tvSeries
#> 32: Crime,Drama 60 1999 2007 tvSeries
#> 33: Comedy,Drama,Romance 22 2009 2020 tvSeries
#> 34: Comedy,Drama 45 2011 2019 tvSeries
#> 35: Drama,Fantasy,Horror 44 2005 2020 tvSeries
#> 36: Action,Crime,Drama 60 2015 2018 tvSeries
#> 37: Biography,Crime,Drama 50 2015 2017 tvSeries
#> 38: Crime,Drama,Mystery 60 2017 2020 tvSeries
#> 39: Action,Adventure,Drama 50 2022 NA tvSeries
#> 40: Action,Adventure,Crime 42 2012 2020 tvSeries
#> 41: Animation,Comedy 22 1989 NA tvSeries
#> 42: Crime,Drama,Thriller 60 2014 2024 tvSeries
#> 43: Action,Adventure,Fantasy 50 2021 2023 tvSeries
#> 44: Crime,Drama,Thriller 45 2015 2019 tvSeries
#> 45: Animation,Crime,Drama 24 2006 2007 tvSeries
#> 46: Action,Adventure,Drama 60 2022 NA tvSeries
#> 47: Animation,Comedy 22 1997 NA tvSeries
#> 48: Comedy,Crime,Fantasy 45 2022 NA tvSeries
#> 49: Action,Adventure,Animation 23 2005 2008 tvSeries
#> 50: Crime,Drama,Thriller 60 2002 2008 tvSeries
#> 51: Comedy,Drama,Sport 30 2020 NA tvSeries
#> 52: Action,Comedy,Drama 39 2021 2021 tvMiniSeries
#> 53: Action,Adventure,Animation 40 2021 2024 tvSeries
#> 54: Comedy,Crime 22 2013 2021 tvSeries
#> 55: Comedy,Drama,Romance 60 2019 2023 tvSeries
#> 56: Animation,Comedy 22 1999 NA tvSeries
#> 57: Action,Adventure,Drama 43 2014 2023 tvSeries
#> 58: Crime,Drama,Fantasy 45 2016 2021 tvSeries
#> 59: Crime,Drama,Thriller 60 2017 2022 tvSeries
#> 60: Crime,Drama,Mystery 60 2011 2020 tvSeries
#> 61: Comedy 22 1989 1998 tvSeries
#> 62: Crime,Drama,Mystery 60 2017 2019 tvSeries
#> 63: Drama,Fantasy,Horror 43 2009 2017 tvSeries
#> 64: Drama,Romance 45 2005 NA tvSeries
#> 65: Drama,Horror,Sci-Fi 60 2011 NA tvSeries
#> 66: Crime,Drama,Thriller 45 2008 2014 tvSeries
#> 67: Crime,Drama,Romance 45 2018 2025 tvSeries
#> 68: Comedy 22 2003 2019 tvSeries
#> 69: Comedy,Crime,Drama 60 2013 2019 tvSeries
#> 70: Drama,Mystery,Thriller 60 2017 2020 tvSeries
#> 71: Comedy,Drama 60 2018 2023 tvSeries
#> 72: Drama,Mystery,Sci-Fi 50 2022 NA tvSeries
#> 73: Drama 45 2021 NA tvSeries
#> 74: Action,Adventure,Drama 60 2024 NA tvSeries
#> 75: Comedy,Drama 60 2011 2021 tvSeries
#> 76: Drama,Horror,Mystery 60 2018 2018 tvMiniSeries
#> 77: Action,Adventure,Comedy 60 2019 2024 tvSeries
#> 78: Comedy 22 2009 2015 tvSeries
#> 79: Comedy 22 2009 2015 tvSeries
#> 80: Action,Adventure,Animation 24 1999 NA tvSeries
#> genres runtimeMinutes startYear endYear titleType
Most Episodes
# Per series: number of distinct seasons, number of episodes, and the first
# and last episode year; the 80 series with the most episodes come first.
tv_episodes %>%
  group_by(tconst_series, series_name) %>%
  summarize(
    seasons = n_distinct(seasonNumber),
    episodes = n(),
    start = min(episode_year, na.rm = TRUE),
    last = max(episode_year, na.rm = TRUE),
    # Return an ungrouped result directly (no regrouping message, no need for
    # a later ungroup()).
    .groups = "drop"
  ) %>%
  arrange(desc(episodes)) %>%
  head(80)
#> # A tibble: 80 × 6
#> tconst_series series_name seasons episodes start last
#> <chr> <chr> <int> <int> <dbl> <dbl>
#> 1 tt0115147 The Daily Show 30 3907 1996 2025
#> 2 tt0084987 The Bill 26 2403 1984 2010
#> 3 tt3444938 The Tonight Show Starring Jimmy F… 12 2099 2014 2025
#> 4 tt0185103 WWE Raw 33 1669 1993 2025
#> 5 tt3697842 The Late Show with Stephen Colbert 10 1662 2015 2025
#> 6 tt0458254 The Colbert Report 12 1357 2005 2014
#> 7 tt0227972 WWE Smackdown! 27 1344 1999 2025
#> 8 tt1820166 Ridiculousness 42 1309 2011 2024
#> 9 tt0806910 Tatort 1 1303 1970 2025
#> 10 tt0088512 EastEnders 1 1257 1985 2025
#> # ℹ 70 more rows
Episode Heatmap
# IMDB identifier of the series to plot ("The Simpsons").
series_id <- "tt0096697"
# Episodes of that series with a positive episode number, ordered by season
# and then by episode.
series_episodes <- arrange(
  filter(tv_episodes, tconst_series == series_id & episodeNumber > 0),
  seasonNumber,
  episodeNumber
)
# Build the corner coordinates of a unit square around every
# (episodeNumber, seasonNumber) point so each episode can be drawn as a polygon.
plot_eps <- series_episodes %>%
  select(series_name:episode_rating) %>%
  mutate(
    x_start = episodeNumber - 0.5,
    x_end = episodeNumber + 0.5,
    y_start = seasonNumber - 0.5,
    y_end = seasonNumber + 0.5
  ) %>%
  # Two successive pivots cross the two x values with the two y values,
  # giving four corner rows per episode.
  pivot_longer(c(x_start, x_end), names_to = NULL, values_to = "x") %>%
  pivot_longer(c(y_start, y_end), names_to = NULL, values_to = "y") %>%
  group_by(seasonNumber, episodeNumber, episode_name) %>%
  # Number the corners so they are visited around the perimeter:
  # (min x, min y) -> (min x, max y) -> (max x, max y) -> (max x, min y).
  mutate(draw_order = case_when(
    x == min(x) & y == min(y) ~ 1,
    x == min(x) & y == max(y) ~ 2,
    x == max(x) & y == max(y) ~ 3,
    x == max(x) & y == min(y) ~ 4
  )) %>%
  # The grouping is only needed for the per-episode min/max above.
  ungroup() %>%
  arrange(seasonNumber, episodeNumber, draw_order)
# Season-by-episode grid: one filled square per episode (drawn from the corner
# coordinates in plot_eps), coloured and labelled with its rating.
episode_heatmap <- plot_eps %>%
  ggplot(aes(
    x = episodeNumber,
    y = seasonNumber,
    group = paste(seasonNumber, episodeNumber),
    fill = episode_rating
  )) +
  geom_polygon(aes(x = x, y = y)) + # corner coordinates used here
  # plot_eps holds four corner rows per episode, so labelling from it would
  # stack four identical labels in every cell; label from the
  # one-row-per-episode table instead.
  geom_text(aes(label = episode_rating), data = series_episodes, size = 2) +
  scale_fill_distiller(
    palette = "RdYlGn",
    direction = 1,
    limits = c(5, 10),
    # Argument name spelled out in full rather than relying on partial
    # matching of `label`.
    labels = label_number(accuracy = 1),
    name = "IMDB Rating"
  ) +
  scale_x_continuous(
    breaks = seq_len(max(series_episodes$episodeNumber)),
    name = "Episode"
  ) +
  scale_y_reverse(
    breaks = seq_len(max(series_episodes$seasonNumber)),
    name = "Season"
  ) +
  labs(title = series_episodes$series_name[1]) +
  theme_minimal() +
  theme(
    axis.line = element_blank(),
    plot.background = element_blank(),
    # All major grid lines are blanked, then the x ones are switched back on.
    panel.grid.major = element_blank(),
    panel.grid.major.x = element_line(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    plot.title = element_text(size = 20, face = "bold"),
    axis.title = element_text(size = 20),
    axis.text = element_text(face = "bold", size = 7)
  )
Top episodes:
# Highest-rated episodes of the selected series, each labelled
# "<season>–<episode>".
# NOTE: despite the `topten_` name, head(20) keeps twenty episodes.
topten_episodes <- series_episodes %>%
  arrange(desc(episode_rating)) %>%
  mutate(season_episode = paste0(seasonNumber, "–", episodeNumber)) %>%
  select(
    season_episode,
    episode_name,
    episode_rating,
    numVotes,
    episode_year
  ) %>%
  head(20)

# Bars start at `ymin` rather than zero: bar heights are shifted down by ymin
# and the y-axis labels add it back.
ymin <- 5
topten_episodes %>%
  mutate(
    ratingoffset = episode_rating - ymin,
    # Fix the factor levels to the current (rating-sorted) row order so the
    # x axis keeps that order.
    season_episode = factor(season_episode, levels = season_episode)
  ) %>%
  ggplot(aes(x = season_episode, y = ratingoffset, fill = episode_rating)) +
  geom_col(color = "black") +
  # Constant settings (position, rotation, justification) are passed as
  # parameters instead of being mapped inside aes().
  geom_text(aes(label = episode_name), y = 0.1, angle = 90, hjust = 0) +
  geom_text(aes(label = episode_rating), vjust = -0.2) +
  scale_y_continuous(
    labels = function(n) n + ymin,
    name = "Rating"
  ) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.8)) +
  scale_fill_distiller(
    palette = "RdYlGn",
    direction = 1,
    limits = c(5, 10),
    # Argument name spelled out in full rather than relying on partial
    # matching of `label`.
    labels = label_number(accuracy = 1),
    name = "IMDB Rating"
  )
Season Ratings:
# Mean episode rating per season, rounded to one decimal place.
season_ratings <- series_episodes %>%
  group_by(seasonNumber) %>%
  summarize(rating = mean(episode_rating) %>% round(1))

# Bars start at `ymin` rather than zero: bar heights are shifted down by ymin
# and the y-axis labels add it back.
ymin <- 5
season_ratings %>%
  mutate(ratingoffset = rating - ymin) %>%
  ggplot(aes(x = seasonNumber, y = ratingoffset, fill = rating)) +
  geom_col(color = "black") +
  scale_y_continuous(
    labels = function(n) n + ymin,
    name = "Rating"
  ) +
  scale_fill_distiller(
    palette = "RdYlGn",
    direction = 1,
    limits = c(5, 10),
    # Argument name spelled out in full rather than relying on partial
    # matching of `label`.
    labels = label_number(accuracy = 1),
    name = "IMDB Rating"
  )
