This R Notebook is the complement to my blog post "Predicting the Success of a Reddit Submission with Deep Learning and Keras".

This notebook is licensed under the MIT License. If you use the code or data visualization designs contained within this notebook, it would be greatly appreciated if proper attribution is given back to this notebook and/or myself. Thanks! :)

1 Setup

library(readr)
library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library(ggplot2)
library(scales)

Attaching package: ‘scales’

The following object is masked from ‘package:readr’:

    col_factor
library(tidyr)
library(tsne)
library(viridis)
Loading required package: viridisLite
sessionInfo()
R version 3.4.0 (2017-04-21)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.5

Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] viridis_0.4.0      viridisLite_0.2.0  tsne_0.1-3         tidyr_0.6.3       
[5] scales_0.4.1       ggplot2_2.2.1.9000 dplyr_0.7.0        readr_1.1.1       

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.11     knitr_1.16       magrittr_1.5     hms_0.3          munsell_0.4.3   
 [6] colorspace_1.3-2 R6_2.2.2         rlang_0.1.1      plyr_1.8.4       stringr_1.2.0   
[11] tools_3.4.0      grid_3.4.0       gtable_0.2.0     htmltools_0.3.6  lazyeval_0.2.0  
[16] yaml_2.1.14      rprojroot_1.2    digest_0.6.12    assertthat_0.2.0 tibble_1.3.3    
[21] gridExtra_2.2.1  base64enc_0.1-3  glue_1.1.0       evaluate_0.10    rmarkdown_1.6   
[26] stringi_1.1.5    compiler_3.4.0   backports_1.1.0  jsonlite_1.5    

Load data.

# Per-epoch training log: loss and accuracy for the main and auxiliary outputs,
# each with a matching val_* column.
df_training <- read_csv(file = "training.csv")
Parsed with column specification:
cols(
  epoch = col_integer(),
  aux_out_acc = col_double(),
  aux_out_loss = col_double(),
  loss = col_double(),
  main_out_acc = col_double(),
  main_out_loss = col_double(),
  val_aux_out_acc = col_double(),
  val_aux_out_loss = col_double(),
  val_loss = col_double(),
  val_main_out_acc = col_double(),
  val_main_out_loss = col_double()
)
# Day-of-week embeddings: space-delimited, no header row, so readr assigns
# default column names (X1, X2, ...).
# NOTE(review): quote = "—" presumably names a character that never occurs in
# the file so that quote handling is effectively disabled -- confirm.
df_dayofweeks <- read_delim("dayofweeks_embeddings.txt", delim = " ",
                            col_names = FALSE, quote = "—")
Parsed with column specification:
cols(
  .default = col_double(),
  X1 = col_integer()
)
See spec(...) for full column specifications.
# Day-of-year embeddings: space-delimited, no header row, so readr assigns
# default column names (X1, X2, ...).
# NOTE(review): quote = "—" presumably names a character that never occurs in
# the file so that quote handling is effectively disabled -- confirm.
df_dayofyears <- read_delim("dayofyears_embeddings.txt", delim = " ",
                            col_names = FALSE, quote = "—")
Parsed with column specification:
cols(
  .default = col_double(),
  X1 = col_integer()
)
See spec(...) for full column specifications.
# Hour embeddings: space-delimited, no header row, so readr assigns default
# column names (X1, X2, ...).
# NOTE(review): quote = "—" presumably names a character that never occurs in
# the file so that quote handling is effectively disabled -- confirm.
df_hours <- read_delim("hours_embeddings.txt", delim = " ",
                       col_names = FALSE, quote = "—")
Parsed with column specification:
cols(
  .default = col_double(),
  X1 = col_integer()
)
See spec(...) for full column specifications.
# Minute embeddings: space-delimited, no header row, so readr assigns default
# column names (X1, X2, ...).
# NOTE(review): quote = "—" presumably names a character that never occurs in
# the file so that quote handling is effectively disabled -- confirm.
df_minutes <- read_delim("minutes_embeddings.txt", delim = " ",
                         col_names = FALSE, quote = "—")
Parsed with column specification:
cols(
  .default = col_double(),
  X1 = col_integer()
)
See spec(...) for full column specifications.

2 Plot Model Training

# Baseline accuracy: the model's test accuracy must be better than this to be
# of any use. Drawn as the dashed horizontal line on the accuracy chart.
# NOTE(review): presumably the accuracy of always predicting the majority
# class -- confirm where this value was computed.
no_information_rate <- 0.64137998126

Tidy dataframe to plot Training Accuracy vs. Test Accuracy

# Long-format copy of the training log for plotting: the two main-output
# accuracy columns are stacked into `type` (source column name) and `perc`
# (accuracy value). `epoch` is shifted up by one first
# (presumably converting a 0-based epoch counter to 1-based -- confirm).
df_training_tf <- gather(
  mutate(df_training, epoch = epoch + 1),
  key = type,
  value = perc,
  main_out_acc, val_main_out_acc
)
# Accuracy-per-epoch line chart for the training and test splits, with the
# no-information rate drawn as a dashed horizontal reference line.
plot <- ggplot(df_training_tf, aes(epoch, perc, color = type)) +
  geom_line() +
  # shape 21 is a fillable circle: white interior, outline in the series color
  geom_point(size = 1, fill = "white", shape = 21) +
  geom_hline(yintercept = no_information_rate, linetype = "dashed") +
  scale_x_continuous(breaks = seq(1, 20)) +
  # the y axis is fixed to the 60%-75% band; values outside it are not drawn
  scale_y_continuous(labels = percent, limits = c(0.60, 0.75)) +
  # Colors and legend labels are tied to the `type` values by name rather than
  # by position, so they cannot be swapped if the ordering of `type` changes.
  scale_color_manual(
    values = c(main_out_acc = "#2980b9", val_main_out_acc = "#27ae60"),
    breaks = c("main_out_acc", "val_main_out_acc"),
    labels = c("Training Data (80%)", "Test Data (20%)")
  ) +
  labs(
    title = "Accuracy of Reddit Predictive Model During Training",
    x = "# Epoch",
    y = "Accuracy of Model on Dataset",
    caption = "Max Woolf — minimaxir.com",
    color = ""  # empty string suppresses the legend title
  ) +
  theme_minimal(base_size = 9, base_family = "Source Sans Pro") +
  theme(
    plot.title = element_text(size = 11, family = "Source Sans Pro Bold"),
    axis.title.x = element_text(family = "Source Sans Pro Semibold"),
    axis.title.y = element_text(family = "Source Sans Pro Semibold"),
    plot.caption = element_text(size = 6, color = "#969696"),
    legend.position = "top",
    # negative margins pull the legend closer to the title and the panel
    legend.margin = margin(t = -0.1, b = -0.25, unit = "cm"),
    panel.grid.minor = element_blank()
  )

# Write the chart to disk (width/height use ggsave's default unit).
ggsave("predict-reddit-1.png", plot, width = 4, height = 3)