This R Notebook is the complement to my blog post "Problems with Predicting Post Performance on Reddit and Other Link Aggregators."

This notebook is licensed under the MIT License. If you use the code or data visualization designs contained within this notebook, it would be greatly appreciated if proper attribution is given back to this notebook and/or myself. Thanks! :)

library(tidyverse)
library(scales)
library(ggridges)
library(bigrquery)
sessionInfo()
R version 3.5.0 (2018-04-23)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.6

Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] bigrquery_1.0.0 ggridges_0.5.0  scales_1.0.0    forcats_0.3.0   stringr_1.3.1  
 [6] dplyr_0.7.6     purrr_0.2.5     readr_1.1.1     tidyr_0.8.1     tibble_1.4.2   
[11] ggplot2_3.0.0   tidyverse_1.2.1

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.18     cellranger_1.1.0 pillar_1.3.0     compiler_3.5.0   plyr_1.8.4      
 [6] bindr_0.1.1      tools_3.5.0      jsonlite_1.5     lubridate_1.7.4  nlme_3.1-137    
[11] gtable_0.2.0     lattice_0.20-35  pkgconfig_2.0.2  rlang_0.2.2      DBI_1.0.0       
[16] cli_1.0.0        rstudioapi_0.7   yaml_2.2.0       haven_1.1.2      bindrcpp_0.2.2  
[21] withr_2.1.2      xml2_1.2.0       httr_1.3.1       knitr_1.20       hms_0.4.2       
[26] grid_3.5.0       tidyselect_0.2.4 glue_1.3.0       R6_2.2.2         readxl_1.1.0    
[31] modelr_0.1.2     magrittr_1.5     backports_1.1.2  rvest_0.3.2      assertthat_0.2.0
[36] colorspace_1.3-2 stringi_1.2.4    lazyeval_0.2.1   munsell_0.5.0    broom_0.5.0     
[41] crayon_1.3.4    
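
# global ggplot2 theme for every plot in this notebook; assumes the
# Source Sans Pro font family is installed locally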
theme_set(theme_minimal(base_size=9, base_family="Source Sans Pro") +
            theme(plot.title = element_text(size=8, family="Source Sans Pro Bold", margin=margin(t = -0.1, b = 0.1, unit='cm')),
                  axis.title.x = element_text(size=8),
                  axis.title.y = element_text(size=8),
                  plot.subtitle = element_text(family="Source Sans Pro Semibold", color="#969696", size=6),
                  plot.caption = element_text(size=6, color="#969696"),
                  legend.title = element_text(size=8),
                  legend.key.width = unit(0.25, unit='cm')))

Set the BigQuery project ID (change this to your own):

project_id <- "poetic-analog-126704"
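
If you run this notebook yourself, bigrquery triggers a browser OAuth prompt on the first query. A minimal sketch of non-interactive authentication instead, assuming a service account key (the JSON path is a hypothetical placeholder):

# optional: authenticate with a service account key instead of browser OAuth
# (hypothetical path; uncomment to use)
# set_service_token("/path/to/service-account.json")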

1 Post Distribution By Hour/Day-Of-Week

1.1 Reddit

The BigQuery query keeps the 100 subreddits with the most distinct submitters between January 2017 and May 2018, then computes the 25th/50th/75th score percentiles for each subreddit/hour/day-of-week bucket:
query <- '
#standardSQL
SELECT
subreddit,
post_hour,
post_weekday,
COUNT(*) as num_instances,
ROUND(AVG(score)) as avg_score,
perc_25, perc_50, perc_75
FROM (
  SELECT *,
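  -- window functions attach the score quartiles of each (subreddit, hour, weekday) bucket to every row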
  PERCENTILE_CONT(score, 0.25) OVER (PARTITION BY subreddit, post_hour, post_weekday) as perc_25,
  PERCENTILE_CONT(score, 0.50) OVER (PARTITION BY subreddit, post_hour, post_weekday) as perc_50,
  PERCENTILE_CONT(score, 0.75) OVER (PARTITION BY subreddit, post_hour, post_weekday) as perc_75
  FROM (
    SELECT 
    EXTRACT(HOUR FROM TIMESTAMP_SECONDS(created_utc) AT TIME ZONE "America/New_York") as post_hour,
    EXTRACT(DAYOFWEEK FROM TIMESTAMP_SECONDS(created_utc) AT TIME ZONE "America/New_York") as post_weekday,
    subreddit,
    score
    FROM `fh-bigquery.reddit_posts.*`
    WHERE _TABLE_SUFFIX BETWEEN "2017_01" AND "2018_05"
        AND subreddit IN (
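      -- keep only the top 100 subreddits, ranked by approximate distinct submitters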
      SELECT subreddit
      FROM `fh-bigquery.reddit_posts.*`
      WHERE _TABLE_SUFFIX BETWEEN "2017_01" AND "2018_05"
      GROUP BY subreddit
      ORDER BY APPROX_COUNT_DISTINCT(author) DESC
      LIMIT 100
    )
)
)
GROUP BY subreddit, post_hour, post_weekday, perc_25, perc_50, perc_75
ORDER BY subreddit, post_hour, post_weekday
'
df_reddit_hour_doy <- bq_project_query(project_id, query, use_legacy_sql=F) %>%
                        bq_table_download()

Complete
Billed: 7.27 GB
Downloading 16,771 rows in 2 pages.

df_reddit_hour_doy %>% head()

Mutate the numeric hour/day-of-week values into named factors:

  • Hours range from 0 to 23, where 0 is 12 AM.
  • Days of the week range from 1 to 7, where 1 is Sunday.
hour_labels <- c(paste(c(12, 1:11), "AM"), paste(c(12, 1:11), "PM"))
print(hour_labels)
 [1] "12 AM" "1 AM"  "2 AM"  "3 AM"  "4 AM"  "5 AM"  "6 AM"  "7 AM"  "8 AM"  "9 AM"  "10 AM"
[12] "11 AM" "12 PM" "1 PM"  "2 PM"  "3 PM"  "4 PM"  "5 PM"  "6 PM"  "7 PM"  "8 PM"  "9 PM" 
[23] "10 PM" "11 PM"
doy_labels <- c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
df_reddit_hour_doy <- df_reddit_hour_doy %>%
                        mutate(post_hour = factor(post_hour, labels=hour_labels),
                               post_weekday = factor(post_weekday, labels=doy_labels))
df_reddit_hour_doy %>% head()
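
With the hour and weekday factors in place, the distribution can be plotted as an hour-by-weekday heatmap. A minimal sketch, not necessarily the chart from the blog post: the choice of AskReddit (assumed to be among the top 100 subreddits) and the median-score fill are my own illustration.

df_reddit_hour_doy %>%
  filter(subreddit == "AskReddit") %>%
  ggplot(aes(x = post_hour, y = fct_rev(post_weekday), fill = perc_50)) +
  # one tile per (hour, weekday) bucket, filled by the bucket's median score
  geom_tile() +
  scale_fill_viridis_c() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  labs(title = "Median Score of AskReddit Submissions by Time Posted",
       x = "Hour Posted (Eastern Time)", y = "", fill = "Median Score",
       caption = "Reddit submissions from January 2017 to May 2018")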