Analyzing data for #tidytuesday week of 01/08/2019 (source)
# LOAD PACKAGES AND PARSE DATA
library(knitr)
library(tidyverse)
library(RColorBrewer)
library(forcats)
library(lubridate)
library(broom)
# Download the IMDb / Economist TV ratings CSV published for TidyTuesday
# (2019-01-08) straight from the rfordatascience GitHub repository.
tv_data_raw <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-08/IMDb_Economist_tv_ratings.csv"
)

# Keep the raw download untouched; all later steps work on `tv_data`.
tv_data <- tv_data_raw
Prepare the data for k-means clustering
# Summarise rating and share (min / mean / max) for every
# title / genres / date combination.
tv_data_summarized <- tv_data %>%
  group_by(title, genres, date) %>%
  summarize(
    min_rating = min(av_rating),
    avg_rating = mean(av_rating),
    max_rating = max(av_rating),
    min_share = min(share),
    avg_share = mean(share),
    max_share = max(share)
  ) %>%
  ungroup()

# kmeans() needs purely numeric input, so drop the identifier columns.
# Row order is unchanged, so row i here still corresponds to row i of
# `tv_data_summarized`.
kclust_data <- tv_data_summarized %>%
  select(-title, -genres, -date)

# kmeans() chooses its starting centres at random; fix the seed so cluster
# numbering and membership are reproducible between runs.
set.seed(2019)

# Spell out `centers` in full instead of relying on partial argument
# matching (`center`).
kclust_results <- kmeans(kclust_data, centers = 9)
Check output data (boxplot)
# CHECK OUTPUT DATA
# Boxplot of the average rating within each k-means cluster.
#
# The cluster labels are attached by row position: `kclust_results$cluster`
# is in the row order of `kclust_data`, which was derived from
# `tv_data_summarized` without reordering. This replaces an implicit
# left_join() on all six numeric columns, which duplicated rows whenever two
# shows/seasons shared identical metric values.
tv_data_summarized %>%
  mutate(.cluster = factor(kclust_results$cluster)) %>%
  ggplot() +
  geom_boxplot(
    aes(.cluster, avg_rating, fill = .cluster),
    show.legend = FALSE,
    alpha = 0.5
  ) +
  theme_light() +
  labs(
    x = "Cluster #",
    y = "Average Rating",
    caption = "Source: The Economist",
    title = "Average rating distribution for each cluster assignment"
  ) +
  scale_fill_brewer(palette = "Paired")
Check output data (scatterplot)
# Scatterplot of average rating against (log-scaled) average share, coloured
# by k-means cluster.
#
# The cluster labels are attached by row position: `kclust_results$cluster`
# is in the row order of `kclust_data`, which was derived from
# `tv_data_summarized` without reordering. This replaces an implicit
# left_join() on all six numeric columns, which duplicated rows whenever two
# shows/seasons shared identical metric values.
tv_data_summarized %>%
  mutate(.cluster = factor(kclust_results$cluster)) %>%
  # NOTE(review): the y value is log10(avg_share) + 1, not log10(avg_share + 1);
  # a share of 0 therefore maps to -Inf. Confirm which was intended.
  ggplot(aes(avg_rating, log10(avg_share) + 1, color = .cluster)) +
  geom_point(alpha = 0.7, size = 3, show.legend = FALSE) +
  theme_light() +
  labs(
    x = "Average Rating",
    y = "Share (log10)",
    caption = "Source: The Economist",
    title = "Relationship between Average Rating and Shares by cluster assignment"
  ) +
  # The points use the colour aesthetic, so the palette must be applied to the
  # colour scale; a fill scale has no effect on this plot.
  scale_color_brewer(palette = "Paired")
Finalize the plot
# Final plot: the highest-rated rows of every cluster, drawn as text labels
# and faceted by 5-year period.
#
# The cluster labels are attached by row position: `kclust_results$cluster`
# is in the row order of `kclust_data`, which was derived from
# `tv_data_summarized` without reordering. This replaces an implicit
# left_join() on all six numeric columns, which duplicated rows whenever two
# shows/seasons shared identical metric values.
tv_data_summarized %>%
  mutate(
    .cluster = factor(kclust_results$cluster),
    # Floor the year to the start of its 5-year bin (e.g. 2013 -> 2010).
    five_years = 5 * (year(date) %/% 5)
  ) %>%
  # NOTE(review): this keeps the top 20 rows (ties included) per *cluster*,
  # while the plot title reads as top shows per 5-year period; confirm intent.
  group_by(.cluster) %>%
  top_n(20, avg_rating) %>%
  ungroup() %>%
  ggplot(aes(avg_rating, log10(avg_share) + 1, label = title, color = .cluster)) +
  geom_text(show.legend = FALSE) +
  facet_wrap(~five_years) +
  theme_light() +
  labs(
    x = "Average Rating",
    y = "Share (log10)",
    caption = "Source: The Economist",
    title = "Top TV Shows Every 5yrs by Average Rating and Shares (log10)",
    subtitle = "Note: duplicates indicate multiple seasons"
  )