This R Notebook is the complement to my blog post Problems with Predicting Post Performance on Reddit and Other Link Aggregators.
This notebook is licensed under the MIT License. If you use the code or data visualization designs contained within this notebook, it would be greatly appreciated if proper attribution is given back to this notebook and/or myself. Thanks! :)
library(tidyverse)
library(scales)
library(ggridges)
library(bigrquery)
sessionInfo()
R version 3.5.0 (2018-04-23)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.6
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] bigrquery_1.0.0 ggridges_0.5.0 scales_1.0.0 forcats_0.3.0 stringr_1.3.1
[6] dplyr_0.7.6 purrr_0.2.5 readr_1.1.1 tidyr_0.8.1 tibble_1.4.2
[11] ggplot2_3.0.0 tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] Rcpp_0.12.18 cellranger_1.1.0 pillar_1.3.0 compiler_3.5.0 plyr_1.8.4
[6] bindr_0.1.1 tools_3.5.0 jsonlite_1.5 lubridate_1.7.4 nlme_3.1-137
[11] gtable_0.2.0 lattice_0.20-35 pkgconfig_2.0.2 rlang_0.2.2 DBI_1.0.0
[16] cli_1.0.0 rstudioapi_0.7 yaml_2.2.0 haven_1.1.2 bindrcpp_0.2.2
[21] withr_2.1.2 xml2_1.2.0 httr_1.3.1 knitr_1.20 hms_0.4.2
[26] grid_3.5.0 tidyselect_0.2.4 glue_1.3.0 R6_2.2.2 readxl_1.1.0
[31] modelr_0.1.2 magrittr_1.5 backports_1.1.2 rvest_0.3.2 assertthat_0.2.0
[36] colorspace_1.3-2 stringi_1.2.4 lazyeval_0.2.1 munsell_0.5.0 broom_0.5.0
[41] crayon_1.3.4
theme_set(theme_minimal(base_size=9, base_family="Source Sans Pro") +
theme(plot.title = element_text(size=8, family="Source Sans Pro Bold", margin=margin(t = -0.1, b = 0.1, unit='cm')),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8),
plot.subtitle = element_text(family="Source Sans Pro Semibold", color="#969696", size=6),
plot.caption = element_text(size=6, color="#969696"),
legend.title = element_text(size=8),
legend.key.width = unit(0.25, unit='cm')))
BigQuery Project ID (change to your own)
project_id <- "poetic-analog-126704"
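bigrquery triggers a Google OAuth prompt the first time a query runs; newer versions of the package also let you authenticate up front. A minimal sketch, assuming a newer bigrquery and a placeholder email:
# Optional explicit authentication (newer bigrquery versions; email is a placeholder)
bq_auth(email = "you@example.com")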
BigQuery:
#standardSQL
SELECT
subreddit,
post_hour,
post_weekday,
COUNT(*) as num_instances,
ROUND(AVG(score)) as avg_score,
perc_25, perc_50, perc_75
FROM (
SELECT *,
PERCENTILE_CONT(score, 0.25) OVER (PARTITION BY subreddit, post_hour, post_weekday) as perc_25,
PERCENTILE_CONT(score, 0.50) OVER (PARTITION BY subreddit, post_hour, post_weekday) as perc_50,
PERCENTILE_CONT(score, 0.75) OVER (PARTITION BY subreddit, post_hour, post_weekday) as perc_75
FROM (
SELECT
EXTRACT(HOUR FROM TIMESTAMP_SECONDS(created_utc) AT TIME ZONE "America/New_York") as post_hour,
EXTRACT(DAYOFWEEK FROM TIMESTAMP_SECONDS(created_utc) AT TIME ZONE "America/New_York") as post_weekday,
subreddit,
score
FROM `fh-bigquery.reddit_posts.*`
WHERE _TABLE_SUFFIX BETWEEN "2017_01" AND "2018_05"
AND subreddit IN (
SELECT subreddit
FROM `fh-bigquery.reddit_posts.*`
WHERE _TABLE_SUFFIX BETWEEN "2017_01" AND "2018_05"
GROUP BY subreddit
ORDER BY APPROX_COUNT_DISTINCT(author) DESC
LIMIT 100
)
)
)
GROUP BY subreddit, post_hour, post_weekday, perc_25, perc_50, perc_75
ORDER BY subreddit, post_hour, post_weekday
query <- '
#standardSQL
SELECT
subreddit,
post_hour,
post_weekday,
COUNT(*) as num_instances,
ROUND(AVG(score)) as avg_score,
perc_25, perc_50, perc_75
FROM (
SELECT *,
PERCENTILE_CONT(score, 0.25) OVER (PARTITION BY subreddit, post_hour, post_weekday) as perc_25,
PERCENTILE_CONT(score, 0.50) OVER (PARTITION BY subreddit, post_hour, post_weekday) as perc_50,
PERCENTILE_CONT(score, 0.75) OVER (PARTITION BY subreddit, post_hour, post_weekday) as perc_75
FROM (
SELECT
EXTRACT(HOUR FROM TIMESTAMP_SECONDS(created_utc) AT TIME ZONE "America/New_York") as post_hour,
EXTRACT(DAYOFWEEK FROM TIMESTAMP_SECONDS(created_utc) AT TIME ZONE "America/New_York") as post_weekday,
subreddit,
score
FROM `fh-bigquery.reddit_posts.*`
WHERE _TABLE_SUFFIX BETWEEN "2017_01" AND "2018_05"
AND subreddit IN (
SELECT subreddit
FROM `fh-bigquery.reddit_posts.*`
WHERE _TABLE_SUFFIX BETWEEN "2017_01" AND "2018_05"
GROUP BY subreddit
ORDER BY APPROX_COUNT_DISTINCT(author) DESC
LIMIT 100
)
)
)
GROUP BY subreddit, post_hour, post_weekday, perc_25, perc_50, perc_75
ORDER BY subreddit, post_hour, post_weekday
'
df_reddit_hour_doy <- bq_project_query(project_id, query, use_legacy_sql=F) %>%
bq_table_download()
Complete
Billed: 7.27 GB
Downloading 16,771 rows in 2 pages.
df_reddit_hour_doy %>% head()
Mutate the numeric hour/day-of-week into named factors (post_hour is 0-23, where 0 is 12 AM; post_weekday is 1-7, where 1 is Sunday).
hour_labels <- c(paste(c(12, 1:11), "AM"), paste(c(12, 1:11), "PM"))
print(hour_labels)
[1] "12 AM" "1 AM" "2 AM" "3 AM" "4 AM" "5 AM" "6 AM" "7 AM" "8 AM" "9 AM" "10 AM"
[12] "11 AM" "12 PM" "1 PM" "2 PM" "3 PM" "4 PM" "5 PM" "6 PM" "7 PM" "8 PM" "9 PM"
[23] "10 PM" "11 PM"
doy_labels <- c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
df_reddit_hour_doy <- df_reddit_hour_doy %>%
mutate(post_hour = factor(post_hour, labels=hour_labels),
post_weekday = factor(post_weekday, labels=doy_labels))
df_reddit_hour_doy %>% head()
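A quick check, not in the original notebook: the factor levels should match the label vectors defined above.
# Confirm the hour and weekday factor levels
levels(df_reddit_hour_doy$post_hour)
levels(df_reddit_hour_doy$post_weekday)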
Additionally, normalize post counts by subreddit.
df_reddit_hour_doy <- df_reddit_hour_doy %>%
group_by(subreddit) %>%
mutate(prop = num_instances / sum(num_instances))
df_reddit_hour_doy %>% select(subreddit, num_instances, prop) %>% head()
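A sanity check, not in the original notebook: the proportions should sum to 1 within each subreddit.
# Each subreddit's hour/weekday proportions should total 1
df_reddit_hour_doy %>%
  group_by(subreddit) %>%
  summarize(total_prop = sum(prop)) %>%
  head()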
Tabulate the Subreddits. If we want to take the Top n subreddits excluding certain ones, we can filter down from this list.
df_top_subreddits <- df_reddit_hour_doy %>%
group_by(subreddit) %>%
summarize(total_posts = sum(num_instances)) %>%
arrange(desc(total_posts))
df_top_subreddits %>% head(50)
# exclude subreddits which have high medians
sub_exclude <- c("The_Donald", "PrequelMemes", "aww", "cats", "CrappyDesign", "politics", "CircleofTrust", "gonewild", "news", "technology")
top_subreddits <- df_top_subreddits %>%
filter(!(subreddit %in% sub_exclude)) %>%
head(50) %>%
pull(subreddit)
plot <- ggplot(df_reddit_hour_doy %>% filter(subreddit %in% top_subreddits), aes(x=post_hour, y=fct_rev(post_weekday), fill=perc_50)) +
geom_raster(stat="identity", interpolate=F) +
geom_vline(xintercept=9 - 0.5, color="white", size=0.25, alpha=1) +
geom_vline(xintercept=17 - 0.5, color="white", size=0.25, alpha=1) +
scale_x_discrete() +
scale_y_discrete() +
scale_fill_viridis_c(option="plasma") +
facet_wrap(~ subreddit, nrow=10, ncol=5) +
labs(title='Median Score of Reddit Submissions For 50 Top Subreddits, by Time Posted',
subtitle='For Posts Made January 2017 to May 2018. Vertical lines indicate 9 AM - 5 PM Eastern. Top Subreddits determined by # of unique submitters.',
x='Hour Reddit Post Was Made (12 AM — 11 PM Eastern Time)',
y='Day of Week Reddit Post Was Made',
fill='Median Score for Posts\nMade at Time on Subreddit',
caption = "Max Woolf — minimaxir.com") +
theme(legend.position = 'top',
legend.title = element_text(size = 6),
legend.text = element_text(size = 5),
legend.key.width = unit(1, unit='cm'),
legend.key.height = unit(0.25, unit='cm'),
legend.margin = margin(c(0, 0, -0.4, 0), unit='cm'),
axis.text.x = element_blank(),
axis.text.y = element_text(size = 5))
ggsave('reddit_subreddit_hr_doy.png', plot, width=6, height=8)
# exclude subreddits which have unexplained spikes
sub_exclude <- c("CircleofTrust", "Music", "business", "news", "technology")
top_subreddits <- df_top_subreddits %>%
filter(!(subreddit %in% sub_exclude)) %>%
head(50) %>%
pull(subreddit)
plot <- ggplot(df_reddit_hour_doy %>% filter(subreddit %in% top_subreddits), aes(x=post_hour, y=fct_rev(post_weekday), fill=prop)) +
geom_raster(stat="identity", interpolate=F) +
geom_vline(xintercept=9 - 0.5, color="white", size=0.25, alpha=1) +
geom_vline(xintercept=17 - 0.5, color="white", size=0.25, alpha=1) +
scale_x_discrete() +
scale_y_discrete() +
scale_fill_viridis_c(option="inferno", labels=percent_format(accuracy=0.1)) +
facet_wrap(~ subreddit, nrow=10, ncol=5) +
labs(title='Heat Map of Day-of-Week and Times of Reddit Submissions For 50 Top Subreddits',
subtitle='For Posts Made January 2017 to May 2018. Vertical lines indicate 9 AM — 5 PM Eastern. Top Subreddits determined by # of unique submitters.',
x='Hour Reddit Post Was Made (12 AM — 11 PM Eastern Time)',
y='Day of Week Reddit Post Was Made',
fill='Proportion of All Posts\nMade on Subreddit',
caption = "Max Woolf — minimaxir.com") +
theme(legend.position = 'top',
legend.title = element_text(size = 6),
legend.text = element_text(size = 5),
legend.key.width = unit(1, unit='cm'),
legend.key.height = unit(0.25, unit='cm'),
legend.margin = margin(c(0, 0, -0.4, 0), unit='cm'),
axis.text.x = element_blank(),
axis.text.y = element_text(size = 5))
ggsave('reddit_subreddit_prop.png', plot, width=6, height=8)
Drill down on subreddits with high medians.
sub_exclude <- c("The_Donald", "PrequelMemes", "aww", "cats", "CrappyDesign", "politics")
top_subreddits <- df_top_subreddits %>%
filter(subreddit %in% sub_exclude) %>%
head(50) %>%
pull(subreddit)
plot <- ggplot(df_reddit_hour_doy %>% filter(subreddit %in% top_subreddits), aes(x=post_hour, y=fct_rev(post_weekday), fill=perc_50)) +
geom_raster(stat="identity", interpolate=F) +
geom_vline(xintercept=9 - 0.5, color="white", size=0.25, alpha=1) +
geom_vline(xintercept=17 - 0.5, color="white", size=0.25, alpha=1) +
scale_x_discrete() +
scale_y_discrete() +
scale_fill_viridis_c(option="plasma") +
facet_wrap(~ subreddit, nrow=3, ncol=2) +
labs(title='Median Score of Reddit Submissions For High Median Subreddits, by Time Posted',
subtitle='For Posts Made January 2017 to May 2018. Vertical lines indicate 9 AM - 5 PM Eastern.',
x='Hour Reddit Post Was Made (12 AM — 11 PM Eastern Time)',
y='Day of Week Reddit Post Was Made',
fill='Median Score for Posts\nMade at Time on Subreddit',
caption = "Max Woolf — minimaxir.com") +
theme(legend.position = 'top',
legend.title = element_text(size = 6),
legend.text = element_text(size = 5),
legend.key.width = unit(1, unit='cm'),
legend.key.height = unit(0.25, unit='cm'),
legend.margin = margin(c(0, 0, -0.4, 0), unit='cm'),
axis.text.x = element_blank(),
axis.text.y = element_text(size = 5))
ggsave('reddit_subreddit_highmedian.png', plot, width=6, height=4)
HN vizzes aren’t included in the post, but are here for my own reference.
Similar ETL as above, but simpler since there is no need to group by subreddit and the hour/weekday labels have already been created.
BigQuery:
#standardSQL
SELECT
post_hour,
post_weekday,
COUNT(*) as num_instances,
ROUND(AVG(score)) as avg_score,
perc_25, perc_50, perc_75
FROM (
SELECT *,
PERCENTILE_CONT(score, 0.25) OVER (PARTITION BY post_hour, post_weekday) as perc_25,
PERCENTILE_CONT(score, 0.50) OVER (PARTITION BY post_hour, post_weekday) as perc_50,
PERCENTILE_CONT(score, 0.75) OVER (PARTITION BY post_hour, post_weekday) as perc_75
FROM (
SELECT
EXTRACT(HOUR FROM timestamp AT TIME ZONE "America/New_York") as post_hour,
EXTRACT(DAYOFWEEK FROM timestamp AT TIME ZONE "America/New_York") as post_weekday,
score
FROM `bigquery-public-data.hacker_news.full`
WHERE DATETIME(timestamp, "America/New_York") BETWEEN "2017-01-01 00:00:00" AND "2018-08-01 00:00:00"
)
)
GROUP BY post_hour, post_weekday, perc_25, perc_50, perc_75
ORDER BY post_hour, post_weekday
query <- '
#standardSQL
SELECT
post_hour,
post_weekday,
COUNT(*) as num_instances,
ROUND(AVG(score)) as avg_score,
perc_25, perc_50, perc_75
FROM (
SELECT *,
PERCENTILE_CONT(score, 0.25) OVER (PARTITION BY post_hour, post_weekday) as perc_25,
PERCENTILE_CONT(score, 0.50) OVER (PARTITION BY post_hour, post_weekday) as perc_50,
PERCENTILE_CONT(score, 0.75) OVER (PARTITION BY post_hour, post_weekday) as perc_75
FROM (
SELECT
EXTRACT(HOUR FROM timestamp AT TIME ZONE "America/New_York") as post_hour,
EXTRACT(DAYOFWEEK FROM timestamp AT TIME ZONE "America/New_York") as post_weekday,
score
FROM `bigquery-public-data.hacker_news.full`
WHERE DATETIME(timestamp, "America/New_York") BETWEEN "2017-01-01 00:00:00" AND "2018-08-01 00:00:00"
)
)
GROUP BY post_hour, post_weekday, perc_25, perc_50, perc_75
ORDER BY post_hour, post_weekday
'
df_hn_hour_doy <- bq_project_query(project_id, query, use_legacy_sql=F) %>%
bq_table_download() %>%
mutate(post_hour = factor(post_hour, labels=hour_labels),
post_weekday = factor(post_weekday, labels=doy_labels),
prop = num_instances / sum(num_instances))
Complete
Billed: 166.72 MB
Downloading 168 rows in 1 pages.
df_hn_hour_doy %>% head()
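Since the Hacker News data is not grouped by subreddit, prop should sum to 1 over the whole table (a quick check, not in the original notebook).
# The proportions across all hour/weekday cells should total 1
df_hn_hour_doy %>% summarize(total_prop = sum(prop))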
plot <- ggplot(df_hn_hour_doy, aes(x=post_hour, y=fct_rev(post_weekday), fill=perc_50)) +
geom_raster(stat="identity", interpolate=F) +
geom_vline(xintercept=9 - 0.5, color="white", size=0.25, alpha=1) +
geom_vline(xintercept=17 - 0.5, color="white", size=0.25, alpha=1) +
scale_x_discrete() +
scale_y_discrete() +
scale_fill_viridis_c(option="plasma") +
labs(title='Median Score of Hacker News submissions, by Time Posted',
subtitle='For Posts Made January 2017 to July 2018. Vertical lines indicate 9 AM - 5 PM Eastern.',
x='Hour Hacker News Post Was Made (12 AM — 11 PM Eastern Time)',
y='Day of Week Hacker News Post Was Made',
fill='Median Score for Posts\nMade at Time',
caption = "Max Woolf — minimaxir.com") +
theme(legend.position = 'top',
legend.title = element_text(size = 6),
legend.text = element_text(size = 5),
legend.key.width = unit(1, unit='cm'),
legend.key.height = unit(0.25, unit='cm'),
legend.margin = margin(c(0, 0, -0.4, 0), unit='cm'),
axis.text.x = element_blank(),
axis.text.y = element_text(size = 5))
ggsave('hn_hr_doy.png', plot, width=5, height=3)
plot <- ggplot(df_hn_hour_doy, aes(x=post_hour, y=fct_rev(post_weekday), fill=prop)) +
geom_raster(stat="identity", interpolate=F) +
geom_vline(xintercept=9 - 0.5, color="white", size=0.25, alpha=1) +
geom_vline(xintercept=17 - 0.5, color="white", size=0.25, alpha=1) +
scale_x_discrete() +
scale_y_discrete() +
scale_fill_viridis_c(option="inferno", labels=percent_format(accuracy=0.1)) +
labs(title='Distribution of Hacker News Submissions, by Time Posted',
subtitle='For Posts Made January 2017 to August 2018. Vertical lines indicate 9 AM - 5 PM Eastern.',
x='Hour Hacker News Post Was Made (12 AM — 11 PM Eastern Time)',
y='Day of Week Hacker News Post Was Made',
fill='Proportion of All Posts',
caption = "Max Woolf — minimaxir.com") +
theme(legend.position = 'top',
legend.title = element_text(size = 6),
legend.text = element_text(size = 5),
legend.key.width = unit(1, unit='cm'),
legend.key.height = unit(0.25, unit='cm'),
legend.margin = margin(c(0, 0, -0.3, 0), unit='cm'),
axis.text.x = element_blank(),
axis.text.y = element_text(size = 5))
ggsave('hn_prop.png', plot, width=5, height=3)
BigQuery:
#standardSQL
SELECT * FROM (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY subreddit ORDER BY perc_75 DESC) entry_num
FROM (
SELECT word, nextword,
subreddit,
COUNT(*) as num_instances,
ROUND(AVG(score)) as avg_score,
perc_25, perc_50, perc_75
FROM (
SELECT *,
PERCENTILE_CONT(score, 0.25) OVER (PARTITION BY subreddit, word, nextword) as perc_25,
PERCENTILE_CONT(score, 0.50) OVER (PARTITION BY subreddit, word, nextword) as perc_50,
PERCENTILE_CONT(score, 0.75) OVER (PARTITION BY subreddit, word, nextword) as perc_75
FROM (
SELECT word,
LEAD(word) OVER(PARTITION BY id ORDER BY pos) nextword,
score,
subreddit
FROM (
SELECT SPLIT(LOWER(REGEXP_REPLACE(title, r'[^a-zA-Z0-9 \'\"]', '')), " ") as words, id, score, subreddit
FROM `fh-bigquery.reddit_posts.*`
WHERE _TABLE_SUFFIX BETWEEN "2017_01" AND "2018_05"
AND subreddit IN (
SELECT subreddit
FROM `fh-bigquery.reddit_posts.*`
WHERE _TABLE_SUFFIX BETWEEN "2017_01" AND "2018_05"
GROUP BY subreddit
ORDER BY APPROX_COUNT_DISTINCT(author) DESC
LIMIT 100
)
), UNNEST(words) word
WITH OFFSET as pos
)
WHERE nextword IS NOT NULL
AND nextword != ''
AND word IS NOT NULL
AND word != ''
)
GROUP BY word, nextword, subreddit, perc_25, perc_50, perc_75
HAVING num_instances >= 100
)
)
WHERE entry_num <= 50
ORDER BY subreddit, entry_num
query <- '
#standardSQL
SELECT * FROM (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY subreddit ORDER BY perc_75 DESC) entry_num
FROM (
SELECT word, nextword,
subreddit,
COUNT(*) as num_instances,
ROUND(AVG(score)) as avg_score,
perc_25, perc_50, perc_75
FROM (
SELECT *,
PERCENTILE_CONT(score, 0.25) OVER (PARTITION BY subreddit, word, nextword) as perc_25,
PERCENTILE_CONT(score, 0.50) OVER (PARTITION BY subreddit, word, nextword) as perc_50,
PERCENTILE_CONT(score, 0.75) OVER (PARTITION BY subreddit, word, nextword) as perc_75
FROM (
SELECT word,
LEAD(word) OVER(PARTITION BY id ORDER BY pos) nextword,
score,
subreddit
FROM (
SELECT SPLIT(LOWER(REGEXP_REPLACE(title, "[^a-zA-Z0-9 \\\'\\\"]", "")), " ") as words, id, score, subreddit
FROM `fh-bigquery.reddit_posts.*`
WHERE _TABLE_SUFFIX BETWEEN "2017_01" AND "2018_05"
AND subreddit IN (
SELECT subreddit
FROM `fh-bigquery.reddit_posts.*`
WHERE _TABLE_SUFFIX BETWEEN "2017_01" AND "2018_05"
GROUP BY subreddit
ORDER BY APPROX_COUNT_DISTINCT(author) DESC
LIMIT 100
)
), UNNEST(words) word
WITH OFFSET as pos
)
WHERE nextword IS NOT NULL
AND nextword != ""
AND word IS NOT NULL
AND word != ""
)
GROUP BY word, nextword, subreddit, perc_25, perc_50, perc_75
HAVING num_instances >= 100
)
)
WHERE entry_num <= 50
ORDER BY subreddit, entry_num
'
reddit_bigrams <- bq_project_query(project_id, query, use_legacy_sql=F) %>%
bq_table_download()
Complete
Billed: 17.59 GB
Downloading 4,944 rows in 1 pages.
reddit_bigrams %>% head()
Filter out rows where both parts of the bigram are common/stop words.
stopwords <- tidytext::get_stopwords() %>% pull(word)
reddit_bigrams_filtered <- reddit_bigrams %>%
filter(!((word %in% stopwords) & (nextword %in% stopwords))) %>%
mutate(bigram = paste(word, nextword))
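To see how much the stop word filter removes (a check not in the original notebook), compare row counts before and after.
# Number of bigram rows dropped because both words are stop words
nrow(reddit_bigrams) - nrow(reddit_bigrams_filtered)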
Visualize only the top 10 bigrams for each subreddit (for readability). Since ggplot2 shares a single factor ordering across all facets, the bars are ordered by a per-row order column and relabeled via scale_x_continuous; this adapts a ggplot2 facet trick by Simon Jackson.
sub_exclude <- c("me_irl", "explainlikeimfive", "Fireteams", "NoStupidQuestions", "GlobalOffensive", "GlobalOffensiveTrade", "RocketLeagueExchange", "leagueoflegends", "CircleofTrust", "PUBATTLEGROUNDS", "dirtykikpals", "gonewild")
top_subreddits <- df_top_subreddits %>%
filter(!(subreddit %in% sub_exclude)) %>%
head(24) %>%
pull(subreddit)
reddit_bigrams_sub <- reddit_bigrams_filtered %>%
filter(subreddit %in% top_subreddits) %>%
group_by(subreddit) %>%
top_n(10, perc_75) %>%
ungroup() %>%
arrange(subreddit, perc_75) %>%
mutate(order = row_number())
plot <- ggplot(reddit_bigrams_sub, aes(x=order, y=perc_75, fill=subreddit)) +
geom_bar(stat="identity") +
scale_x_continuous(
breaks = reddit_bigrams_sub$order,
labels = reddit_bigrams_sub$bigram
) +
scale_y_continuous(labels=comma) +
scale_fill_hue(l=50, guide=F) +
coord_flip() +
facet_wrap(~ subreddit, nrow=6, ncol=4, scales="free") +
labs(title='Title Bigram Importance of Reddit Submissions For 24 Top Subreddits',
subtitle='For Posts Made January 2017 to May 2018. Minimum 100 occurrences of each bigram. Excludes bigrams where both words are stop words.',
x='Bigram',
y='75th Score Percentile for Submissions Containing Bigram on Subreddit',
fill='Median Score for Posts\nMade at Time on Subreddit',
caption = "Max Woolf — minimaxir.com") +
theme(axis.text.x = element_text(size = 6),
axis.text.y = element_text(size = 6),
axis.title.y = element_blank(),
panel.grid.minor = element_blank())
ggsave('reddit_subreddit_topbigrams.png', plot, width=6, height=9)
BigQuery:
#standardSQL
SELECT
subreddit,
score,
COUNT(*) AS num_submissions
FROM
`fh-bigquery.reddit_posts.*`
WHERE
_TABLE_SUFFIX BETWEEN "2017_01"
AND "2018_05"
AND subreddit IN (
SELECT
subreddit
FROM
`fh-bigquery.reddit_posts.*`
WHERE
_TABLE_SUFFIX BETWEEN "2017_01"
AND "2018_05"
GROUP BY
subreddit
ORDER BY
APPROX_COUNT_DISTINCT(author) DESC
LIMIT
100 )
AND score <= 100
GROUP BY
subreddit,
score
ORDER BY
subreddit,
score
query <- '
#standardSQL
SELECT
subreddit,
score,
COUNT(*) AS num_submissions
FROM
`fh-bigquery.reddit_posts.*`
WHERE
_TABLE_SUFFIX BETWEEN "2017_01"
AND "2018_05"
AND subreddit IN (
SELECT
subreddit
FROM
`fh-bigquery.reddit_posts.*`
WHERE
_TABLE_SUFFIX BETWEEN "2017_01"
AND "2018_05"
GROUP BY
subreddit
ORDER BY
APPROX_COUNT_DISTINCT(author) DESC
LIMIT
100 )
AND score <= 100
GROUP BY
subreddit,
score
ORDER BY
subreddit,
score
'
reddit_dist <- bq_project_query(project_id, query, use_legacy_sql=F) %>%
bq_table_download() %>%
left_join(df_top_subreddits) %>%
mutate(prop = num_submissions / total_posts)
Complete
Billed: 5.87 GB
Downloading 10,043 rows in 2 pages.
Joining, by = "subreddit"
reddit_dist %>% head()
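The 'Joining, by = "subreddit"' message appears because left_join guessed the key; passing it explicitly silences the message. A minimal sketch using the frames already in memory:
# Re-derive the join with an explicit key (optional; same result as above)
reddit_dist_explicit <- reddit_dist %>%
  select(subreddit, score, num_submissions) %>%
  left_join(df_top_subreddits, by = "subreddit") %>%
  mutate(prop = num_submissions / total_posts)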
# sub_exclude <- c("The_Donald", "PrequelMemes", "aww", "cats", "CrappyDesign", "politics")
sub_exclude <- c()
top_subreddits <- df_top_subreddits %>%
filter(!(subreddit %in% sub_exclude)) %>%
head(50) %>%
pull(subreddit)
plot <- ggplot(reddit_dist %>% filter(subreddit %in% top_subreddits), aes(x=score, y=fct_rev(subreddit), height=prop, fill=fct_rev(subreddit))) +
geom_density_ridges(stat = 'identity', scale=1.5, size=0.25) +
scale_x_continuous(breaks=c(0, 1, seq(10, 20, 10))) +
scale_y_discrete() +
scale_fill_hue(l = 50, guide=F) +
labs(title='Density Distribution of Score of Reddit Submissions\nFor 50 Top Subreddits',
subtitle='For Posts Made January 2017 to May 2018.\nTop Subreddits determined by # of unique submitters.',
x='Score of Reddit Post',
y='Day of Week Reddit Post Was Made',
fill='Median Score for Posts\nMade at Time on Subreddit',
caption = "Max Woolf — minimaxir.com") +
theme(
axis.text.y = element_text(size = 5),
axis.title.y = element_blank(),
panel.grid.minor = element_blank())
ggsave('reddit_dist.png', plot, width=4, height=6)
Another approach: a faceted bar chart (may work better since the ridgeline plot above is hard to read).
sub_exclude <- c("business", "dirtykikpals", "gonewild", "CircleofTrust", "news", "technology")
top_subreddits <- df_top_subreddits %>%
filter(!(subreddit %in% sub_exclude)) %>%
head(50) %>%
pull(subreddit)
plot <- ggplot(reddit_dist %>% filter(subreddit %in% top_subreddits), aes(x=score, y=prop, fill=subreddit)) +
geom_bar(stat = 'identity') +
scale_x_continuous(breaks=c(0, 1, 5, 10), limits=c(NA, 11)) +
scale_y_continuous(labels=percent_format(accuracy=1), breaks=pretty_breaks(4)) +
scale_fill_hue(l = 50, guide=F) +
facet_wrap(~ subreddit, ncol=5, scales="free") +
labs(title='Score Distribution of Reddit Submissions For 50 Top Subreddits',
subtitle='For Posts Made January 2017 to May 2018. Top Subreddits determined by # of unique submitters.',
x='Score of Reddit Post',
y='% of All Submissions on Subreddit',
fill='Median Score for Posts\nMade at Time on Subreddit',
caption = "Max Woolf — minimaxir.com") +
theme(
axis.text.y = element_text(size = 5),
axis.text.x = element_text(size = 6),
panel.grid.minor = element_blank())
ggsave('reddit_dist_facet.png', plot, width=6, height=8)
The MIT License (MIT)
Copyright (c) 2018 Max Woolf
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.