# Analyzing data for #tidytuesday, week of 12/4/2018 (source)
# LOAD PACKAGES AND PARSE DATA
library(tidyverse)
library(scales)
library(RColorBrewer)
library(forcats)
library(tidytext)
library(stringr)
# Download the Medium data-science articles CSV from the TidyTuesday
# repository (2018-12-04 folder) and parse it with readr.
articles_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-12-04/medium_datasci.csv")
# Work on `articles` so the untouched import stays available as `articles_raw`.
articles <- articles_raw
# Are there differences in the words used between the titles and subtitles?
# Clean up the data before we text-mine the top 10 authors.
# Build `tidy_authors`: one row per (author, word) occurrence taken from the
# combined title + subtitle of each article, with stop words removed.
#
# NOTE(review): `top_authors` is not defined in this section; it is presumably
# a data frame sharing an `author` column with `articles` -- confirm, then
# spell out `by =` on the inner join as well.
data(stop_words)
tidy_authors <- articles %>%
  # Natural join: keeps only the articles that match a row of `top_authors`.
  inner_join(top_authors) %>%
  select(title, subtitle, author) %>%
  # Drops every article that is missing a title, a subtitle, or an author.
  na.omit() %>%
  # paste() already separates its arguments with a single space, so no
  # explicit " " argument is needed.
  mutate(text = paste(title, subtitle)) %>%
  select(author, text) %>%
  unnest_tokens(word, text) %>%
  # Explicit key: avoids the "Joining, by = ..." message and guards against a
  # different column being picked up as the join key by accident.
  anti_join(stop_words, by = "word")
# Calculate proportions and plot the graph.
# Compare every other author's word frequencies against Yves Mulkers'.
# Each facet plots one author's per-word proportion (x) against the reference
# author's proportion (y) on log-log axes; the dashed line is y = x, so words
# close to it have similar proportions for both authors.
tidy_authors %>%
  # Keep the first run of lowercase letters / apostrophes in each token.
  mutate(word = str_extract(word, "[a-z']+")) %>%
  # Tokens containing no such run (e.g. purely numeric ones) come back as NA;
  # drop them so they are not counted as a word or drawn as an "NA" label.
  filter(!is.na(word)) %>%
  count(author, word, sort = TRUE) %>%
  # Proportion of each word within its own author's total word count.
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  select(-n) %>%
  # One column of proportions per author ...
  pivot_wider(names_from = author, values_from = proportion) %>%
  # ... then stack every author except the reference one back into long form.
  # Selecting by exclusion avoids depending on the alphabetical position of
  # specific author columns.
  pivot_longer(
    -c(word, `Yves Mulkers`),
    names_to = "author",
    values_to = "proportion"
  ) %>%
  ggplot(aes(
    x = proportion,
    y = `Yves Mulkers`,
    color = abs(`Yves Mulkers` - proportion)
  )) +
  geom_jitter(
    alpha = 0.1,
    size = 0.5,
    width = 0.25,
    height = 0.25
  ) +
  geom_text(
    aes(label = word),
    check_overlap = TRUE,
    vjust = 1,
    hjust = 1
  ) +
  geom_abline(
    color = "darkslategray4",
    linetype = 2
  ) +
  scale_color_gradient(
    limits = c(0, 0.01),
    low = "salmon",
    high = "blue"
  ) +
  # `accuracy = 1` rounds axis labels to whole percents; this is what the
  # previous positional `round(1)` argument evaluated to.
  # NOTE(review): any break below 0.5% is therefore labelled "0%"; drop
  # `accuracy` if finer labels are wanted.
  scale_x_log10(labels = percent_format(accuracy = 1)) +
  scale_y_log10(labels = percent_format(accuracy = 1)) +
  labs(
    y = "Yves Mulkers",
    x = "",
    title = "Comparing the word frequencies for the top 10 authors on Medium",
    subtitle = " \"Top 10\" defined as the total number of articles published"
  ) +
  theme_bw() +
  theme(legend.position = "none") +
  facet_wrap(~author, ncol = 3)