TidyTuesday: Women in the Workforce

Analyzing data for #tidytuesday week of 3/05/2019 (source) Load libraries library(tidyverse) library(scales) library(lubridate) jobs_gender <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-05/jobs_gender.csv") Clean & plot data jobs_gender %>% filter(year == '2016') %>% mutate(male_diff = ((((total_earnings_male/total_earnings)-1)*workers_male)/total_workers), female_diff = (((total_earnings_female/total_earnings)-1)*workers_female)/total_workers) %>% ggplot() + geom_jitter(aes(total_earnings, female_diff), color = 'salmon', alpha = 0.5, size = 2.5) + geom_jitter(aes(total_earnings, male_diff), color = 'steelblue', alpha = 0.5, size = 2.5) + geom_hline(yintercept = 0, color = 'grey54', lty = 'dashed') + facet_wrap(~major_category) + scale_x_continuous(labels = dollar_format(), limits = c(0,200000)) + scale_y_continuous(labels = percent_format(round(1)), limits = c(-0.3,0.3)) + labs(x = "Average Median Earnings", y = "Difference from Average", caption = "Graphic: @eeysirhc\nSource: Bureau of Labor Statistics", title = "2016 Earnings Differences (Weighted) by Job Sector", subtitle = "Blue = Male; Red = Female") + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), plot.subtitle = element_text(size = 12), legend.position = 'none') ...

March 5, 2019 · Christopher Yee

Hello, can we stop using pie charts?

I came across this tweet and its corresponding graph a few days ago: Did you know? 🧐 1‐word keywords account for only 2.8% of all the keywords people search for in the United States. pic.twitter.com/GXdfttn3jk — Tim Soulo (@timsoulo) February 21, 2019 I love ahrefs and all but it’s 2019 - WHY ARE WE STILL USING PIE CHARTS?! I’ll spare my opinion since there is already a ton of literature out but here’s a few to get started: ...

February 22, 2019 · Christopher Yee

TidyTuesday: Housing Prices

Instead of a static visualization I decided to build a barebones Shiny app this week. The purpose is to improve the interactivity of the final output - one of my 2019 goals to level up advanced R knowledge. You can find the full code here. Analyzing data for #tidytuesday week of 2/5/2019 (source) # LOAD PACKAGES library(tidyverse) library(scales) library(shiny) state_hpi_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-02-05/state_hpi.csv") Process the raw data state_hpi <- state_hpi_raw %>% group_by(state, year) %>% summarize(us_avg = mean(us_avg), price_index = mean(price_index)) %>% mutate(pct_diff = (price_index / us_avg) - 1, segment = ifelse(pct_diff > 0, 'above', 'below'), segment = str_to_title(segment)) Build the UI level Include a drop down menu to select output data by state abbreviation ui <- fluidPage( "Housing Price Index: US Average vs State", selectInput(inputId = "select_state", label = "Choose a state", c(state.abb)), plotOutput("hpi1"), plotOutput("hpi2") ) Build the server level Plot 1: Time series for a given state average annual housing price index compared to the US average Plot 2: Time series for the percentage difference of a given state housing price index against the US average server <- function(input, output, session) { output$hpi1 <- renderPlot({ state_hpi %>% filter(state == input$select_state) %>% group_by(year, state) %>% summarize(price_index = mean(price_index), us_avg = mean(us_avg)) %>% ggplot() + geom_line(aes(year, price_index), size = 2, color = 'steelblue') + geom_col(aes(year, us_avg), alpha = 0.3, fill = 'grey54') + theme_bw() + labs(x = NULL, y = "Housing Price Index") + theme_bw(base_size = 15) + scale_y_continuous(limits = c(0,300)) }) output$hpi2 <- renderPlot({ state_hpi %>% filter(state == input$select_state) %>% ggplot() + geom_col(aes(year, pct_diff, fill = segment), alpha = 0.8) + geom_hline(yintercept = 0, lty = 'dashed') + scale_fill_brewer(palette = 'Set1', direction = -1) + scale_y_continuous(labels = percent_format(round(1))) + theme_bw(base_size = 15) + theme(legend.position = 'top') + labs(x = NULL, y = "Difference to US Average", fill = NULL) }) } Build the app level shinyApp(ui, server) Check out the final production build here ...

February 5, 2019 · Christopher Yee

TidyTuesday: Milk Production

Analyzing data for #tidytuesday week of 1/29/2019 (source) # LOAD PACKAGES library(tidyverse) library(scales) library(lubridate) library(ggmap) library(gganimate) library(ggthemes) library(transformr) library(gifski) library(mapproj) milk_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-29/state_milk_production.csv") milk <- milk_raw Extract geospatial data and parse data usa <- as_tibble(map_data("state")) usa$region <- str_to_title(usa$region) usa <- usa %>% rename(state = region) milk_parsed <- milk %>% select(-region) %>% mutate(milk_10billion = milk_produced / 10000000000, year = as.integer(year)) %>% full_join(usa) %>% filter(!is.na(year), !is.na(long), !is.na(lat)) Build animation milk_animation <- milk_parsed %>% ggplot(aes(long, lat, group = group, fill = milk_10billion)) + geom_polygon(color = 'black') + scale_fill_gradient2(low = "gray97", mid = "steelblue", high = "midnightblue", midpoint = 2.5) + theme_map() + coord_map() + labs(x = NULL, y = NULL, fill = NULL, title = "Milk production per 10 billion pounds", subtitle = "Year: {round(frame_time)}", caption = "Source: USDA") + transition_time(year) animate(milk_animation, height = 800, width = 800) ...

January 29, 2019 · Christopher Yee

TidyTuesday: Incarceration Trends

Analyzing data for #tidytuesday week of 1/22/2019 (source) # LOAD PACKAGES AND PARSE DATA library(tidyverse) library(scales) library(lubridate) library(RColorBrewer) prison_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-22/prison_population.csv") prison <- prison_raw Process the raw data total <- prison %>% filter(pop_category != 'Total' & pop_category != 'Male' & pop_category != 'Female') %>% select(county_name, urbanicity, pop_category, population, prison_population) %>% na.omit() %>% group_by(county_name, urbanicity, pop_category) %>% summarize(population = sum(population), prison_population = sum(prison_population)) %>% ungroup() %>% group_by(county_name, urbanicity) %>% mutate(pct_population = population / sum(population), pct_prisoner = prison_population / sum(prison_population)) What is the proportion of population:prisoners per demographic group ? total %>% filter(pop_category != 'Other') %>% ggplot() + geom_point(aes(pct_population, pct_prisoner), alpha = 0.1, size = 2, color = 'grey') + geom_smooth(aes(pct_population, pct_prisoner, color = pop_category), size = 1.2, se = FALSE) + theme_light() + scale_y_continuous(labels = percent_format()) + scale_x_continuous(labels = percent_format()) + labs(x = "County Population", y = "Prisoner Population", color = "", title = "Comparison of county to prison population by ethnicity from 1970 to 2016", subtitle = "Specific groups are overrepresented in the prisoner population", caption = "Source: Vera Institute of Justice") + geom_abline(linetype = 'dashed') + scale_color_brewer(palette = 'Set1') + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), legend.position = 'top', panel.background = element_rect(fill = 'gray97', color = 'gray97', size = 0.5, linetype = 'solid')) ...

January 22, 2019 · Christopher Yee

TidyTuesday: Space Launches

Analyzing data for #tidytuesday week of 1/15/2019 (source) # LOAD PACKAGES AND PARSE DATA library(tidyverse) library(RColorBrewer) library(forcats) library(scales) library(ebbr) launches_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv") launches <- launches_raw %>% filter(launch_year >= '1960') Distribution of the most space launches over time? countries <- launches %>% count(state_code, sort = TRUE) %>% filter(n >= 100) launches %>% inner_join(countries) %>% # INCOMING NASTY IFELSE CODE (NEED TO REFACTOR) mutate(state_code = ifelse(state_code == 'RU', 'Russia / Soviet Union', ifelse(state_code == 'SU', 'Russia / Soviet Union', ifelse(state_code == 'US', 'United States', ifelse(state_code == 'CN', 'China', ifelse(state_code == 'IN', 'India', ifelse(state_code == 'F', 'France', ifelse(state_code == 'J', 'Japan', state_code)))))))) %>% ggplot() + geom_density(aes(launch_year, fill = state_code, color = state_code), alpha = 0.2) + theme_light() + scale_color_brewer(palette = 'Set1') + scale_fill_brewer(palette = 'Set1') + labs(x = "", y = "", title = "Distribution of space launches over time by country", subtitle = "Minimum of 100 launches", caption = "Source: The Economist", fill = "Country", color = "Country") + scale_y_continuous(labels = percent_format(round(1))) ...

January 15, 2019 · Christopher Yee

TidyTuesday: TV Golden Age

Analyzing data for #tidytuesday week of 01/08/2019 (source) # LOAD PACKAGES AND PARSE DATA library(knitr) library(tidyverse) library(RColorBrewer) library(forcats) library(lubridate) library(broom) tv_data_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-08/IMDb_Economist_tv_ratings.csv") tv_data <- tv_data_raw Prepare the data for k-means clustering tv_data_summarized <- tv_data %>% group_by(title, genres, date) %>% summarize(min_rating = min(av_rating), avg_rating = mean(av_rating), max_rating = max(av_rating), min_share = min(share), avg_share = mean(share), max_share = max(share)) %>% ungroup() kclust_data <- tv_data_summarized %>% select(-title, -genres, -date) kclust_results <- kmeans(kclust_data, center = 9) Check output data (boxplot) # CHECK OUTPUT DATA tv_data_summarized %>% left_join(augment(kclust_results, kclust_data)) %>% mutate(title = factor(title)) %>% group_by(.cluster) %>% ggplot() + geom_boxplot(aes(.cluster, avg_rating, fill = .cluster), show.legend = FALSE, alpha = 0.5) + theme_light() + labs(x = "Cluster #", y = "Average Rating", caption = "Source: The Economist", title = "Average rating distribution for each cluster assignment") + scale_fill_brewer(palette = 'Paired') ...

January 8, 2019 · Christopher Yee

TidyTuesday: rtweet Data

Analyzing data for #tidytuesday week of 01/01/2019 (source) # LOAD PACKAGES AND PARSE DATA library(tidyverse) library(scales) library(RColorBrewer) library(forcats) library(tidytext) library(topicmodels) tweets_raw <- as_tibble(readRDS("rstats_tweets.rds")) Parse data and identify top users # IDEA BEHIND THIS IS TO FILTER OUT BOTS # FIND TOP USERS top_interactions <- tweets_raw %>% select(screen_name, favorite_count, retweet_count) %>% group_by(screen_name) %>% summarize(favorite = sum(favorite_count), retweet = sum(retweet_count)) %>% group_by(screen_name) %>% mutate(total = sum(favorite, retweet)) %>% arrange(desc(total)) %>% head(12) # JOIN TOP USERS WITH RAW DATASET tweets <- tweets_raw %>% inner_join(top_interactions, by='screen_name') # FINAL DATA PROCESSING tweets_parsed <- tweets %>% select(screen_name, text) %>% group_by(screen_name) %>% unnest_tokens(word, text) %>% anti_join(stop_words) %>% filter(!grepl("https|t.co|http|bit.ly|kindly|goo.gl|rstats|amp", word)) # REMOVE EXTRA STOP WORDS What are the most significant keywords for each #rstats Twitter user? tweets_tfidf <- tweets_parsed %>% count(screen_name, word, sort = TRUE) %>% ungroup() %>% bind_tf_idf(word, screen_name, n) tweets_tfidf %>% filter(!near(tf, 1)) %>% arrange(desc(tf_idf)) %>% group_by(screen_name) %>% distinct(screen_name, word, .keep_all = TRUE) %>% top_n(10, tf_idf) %>% ungroup() %>% mutate(word = factor(word, levels = rev(unique(word)))) %>% ggplot(aes(word, tf_idf, fill = screen_name)) + geom_col(show.legend = FALSE) + facet_wrap(~screen_name, ncol = 4, scales = "free") + coord_flip() + theme_light() + labs(x = "", y = "", title = "Highest TF-IDF words for top #rstats Twitter users", caption = "Source: data from {rtweet} package") + scale_fill_brewer(palette = 'Paired') ...

January 1, 2019 · Christopher Yee