---
title: "Predicting the Success of a Reddit Submission with Deep Learning and Keras"
author: "Max Woolf (@minimaxir)"
date: "2017-06-26"
output:
  html_notebook:
    highlight: tango
    mathjax: null
    number_sections: yes
    theme: spacelab
    toc: yes
---

This R Notebook is the complement to my blog post [Predicting the Success of a Reddit Submission with Deep Learning and Keras](http://minimaxir.com/2017/06/reddit-deep-learning/).

This notebook is licensed under the MIT License. If you use the code or data visualization designs contained within this notebook, it would be greatly appreciated if proper attribution is given back to this notebook and/or myself. Thanks! :)

# Setup

```{r}
library(readr)
library(dplyr)
library(ggplot2)
library(scales)
library(tidyr)
library(tsne)
library(viridis)

sessionInfo()
```
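
These packages must already be installed; a minimal one-off install sketch (run interactively rather than as part of the notebook, and assuming a configured CRAN mirror) would be:

```{r, eval=FALSE}
# One-time setup: install the packages loaded above.
install.packages(c("readr", "dplyr", "ggplot2", "scales", "tidyr", "tsne", "viridis"))
```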

Load the Keras training history and the exported embedding matrices.

```{r}
# Per-epoch Keras training history (losses/accuracies for the main and auxiliary outputs).
df_training <- read_csv("training.csv")

# Embedding weights exported from the model: one row per category, with the row
# index in X1 and the embedding values in the remaining columns. quote = "—" sets
# the quote character to one that should never appear, so stray quotes in the
# files do not break parsing.
df_dayofweeks <- read_delim("dayofweeks_embeddings.txt", col_names = F, delim=" ", quote = "—")
df_dayofyears <- read_delim("dayofyears_embeddings.txt", col_names = F, delim=" ", quote = "—")
df_hours <- read_delim("hours_embeddings.txt", col_names = F, delim=" ", quote = "—")
df_minutes <- read_delim("minutes_embeddings.txt", col_names = F, delim=" ", quote = "—")
```
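
As a quick sanity check (a minimal sketch, assuming each embedding file has one row per category, the category index in `X1`, and a 64-dimensional embedding in `X2:X65`), the parsed dimensions can be inspected directly:

```{r, eval=FALSE}
# Each embedding data frame should have 65 columns (index X1 plus 64 embedding
# dimensions X2:X65), with one row per day/hour/minute category.
sapply(list(dayofweeks = df_dayofweeks,
            dayofyears = df_dayofyears,
            hours = df_hours,
            minutes = df_minutes), dim)
```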

# Plot Model Training

```{r}
# No-information rate: the accuracy of always predicting the majority class.
# Test accuracy must be better than this for the model to add value.
no_information_rate <- 0.64137998126
```
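
The value above is hard-coded from the model-training step; as a rough sketch, it corresponds to the majority-class proportion of the labels, e.g. with a hypothetical `df_submissions` data frame and binary `is_top` label (neither of which is part of this notebook's data):

```{r, eval=FALSE}
# Hypothetical derivation of the no-information rate: the proportion of the
# most common class among the labels (df_submissions/is_top are illustrative names).
df_submissions %>%
  count(is_top) %>%
  mutate(prop = n / sum(n)) %>%
  summarize(no_information_rate = max(prop))
```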

Reshape the training history into a tidy (long) data frame to plot training accuracy vs. test accuracy.

```{r}
# Keras logs epochs 0-indexed; shift to 1-indexed, then gather the training and
# validation accuracies into long format for plotting.
df_training_tf <- df_training %>%
                    mutate(epoch = epoch + 1) %>%
                    gather(type, perc, main_out_acc, val_main_out_acc)

plot <- ggplot(df_training_tf, aes(epoch, perc, color=type)) +
          geom_line() +
          geom_point(size=1, fill="white", shape=21) +
          geom_hline(yintercept=no_information_rate, linetype="dashed") +
          scale_x_continuous(breaks=seq(1,20)) +
          scale_y_continuous(labels=percent, limits=c(0.60, 0.75)) +
          scale_color_manual(values=c("#2980b9", "#27ae60"), labels=c("Training Data (80%)", "Test Data (20%)")) +
          labs(title = "Accuracy of Reddit Predictive Model During Training",
               x = "# Epoch",
               y = "Accuracy of Model on Dataset",
               caption = "Max Woolf — minimaxir.com",
               color='') + 
          theme_minimal(base_size=9, base_family="Source Sans Pro") +
            theme(plot.title = element_text(size=11, family="Source Sans Pro Bold"),
                  axis.title.x = element_text(family="Source Sans Pro Semibold"),
                  axis.title.y = element_text(family="Source Sans Pro Semibold"),
                  plot.caption = element_text(size=6, color="#969696"),
                  legend.position="top", legend.margin=margin(t = -0.1, b = -0.25, unit='cm'),
                  panel.grid.minor = element_blank())

ggsave("predict-reddit-1.png", plot, width=4, height=3)
```

![](predict-reddit-1.png)

# Plot Embeddings

## Day of Week

```{r}
# Only 7 points to project, so perplexity is set very low; initial_dims = 64
# keeps all of the embedding dimensions before the t-SNE step.
perplexity = 0
initial_dims = 64
max_iter = 500

set.seed(123)

labels <- c("Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday")

# Project the 64-dimensional day-of-week embeddings (columns X2:X65) down to 2D.
df_dayofweeks_tf <- df_dayofweeks %>% select(X2:X65) %>%
                  data.matrix() %>%
                  tsne(perplexity = perplexity, initial_dims = initial_dims, max_iter = max_iter)

df_dayofweeks_tf <- data.frame(labels, df_dayofweeks_tf) %>%
                    tbl_df() %>%
                    mutate(labels = factor(labels, levels=labels))

df_dayofweeks_tf
```

```{r}
plot <- ggplot(df_dayofweeks_tf, aes(x=X1, y=X2, label=labels, color = labels)) +
          geom_text(family="Source Code Pro Semibold") +
          theme_void(base_family = "Source Sans Pro", base_size=8) +
          scale_color_viridis(discrete=T, guide=FALSE) + 
          scale_x_continuous(limits=c(-3000,3000)) +
          labs(title = "2D Projection of Day-of-Week Vectors in Reddit Prediction Model",
               subtitle = "Labels closer to each other are more similar in context.",
               caption = "Max Woolf — minimaxir.com") +
          theme(plot.margin = unit(c(0.2,0.2,0.2,0.2),"cm"),
                plot.caption = element_text(size=6, color="#969696"))
          
ggsave("predict-reddit-2.png", plot, width=4, height=3)
```

![](predict-reddit-2.png)
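
The same project-with-t-SNE-then-plot pipeline is repeated for each of the remaining embeddings; a small helper along these lines (a sketch with a hypothetical `plot_embedding_tsne()`, not used elsewhere in this notebook) could fold the shared pieces together:

```{r, eval=FALSE}
# Hypothetical helper: project an embedding data frame to 2D with t-SNE and
# return a labeled ggplot. Assumes the embedding lives in columns X2:X65.
plot_embedding_tsne <- function(df, labels, title,
                                perplexity = 30, initial_dims = 64, max_iter = 500) {
  set.seed(123)

  proj <- df %>%
    select(X2:X65) %>%
    data.matrix() %>%
    tsne(perplexity = perplexity, initial_dims = initial_dims, max_iter = max_iter)

  # data.frame() names the two projection columns X1 and X2.
  df_proj <- data.frame(labels = factor(labels, levels = labels), proj)

  ggplot(df_proj, aes(x = X1, y = X2, label = labels, color = labels)) +
    geom_text(family = "Source Code Pro Semibold", size = 3) +
    theme_void(base_family = "Source Sans Pro", base_size = 8) +
    scale_color_viridis(discrete = TRUE, guide = FALSE) +
    labs(title = title,
         subtitle = "Labels closer to each other are more similar in context.",
         caption = "Max Woolf — minimaxir.com") +
    theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "cm"),
          plot.caption = element_text(size = 6, color = "#969696"))
}
```

A call such as `plot_embedding_tsne(df_hours, labels, "2D Projection of Hour (EST) Vectors in Reddit Prediction Model", perplexity = 3)` would then stand in for the corresponding chunk below.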

## Day of Year

```{r}
perplexity = 30
initial_dims = 64
max_iter = 1000

set.seed(123)

labels <- c(paste0("J", 1:31),paste0("F", 1:28), paste0("M", 1:30), paste0("A", 1:31))

# Use only the first 120 days of the year (January through April).
df_dayofyears_tf <- df_dayofyears[1:120,] %>% select(X2:X65) %>%
                  data.matrix() %>%
                  tsne(perplexity = perplexity, initial_dims = initial_dims, max_iter = max_iter)

df_dayofyears_tf <- data.frame(labels, df_dayofyears_tf) %>%
                    tbl_df() %>%
                    mutate(labels = factor(labels, levels=labels))

df_dayofyears_tf
```


```{r}
xscale = 200
yscale = 200

plot <- ggplot(df_dayofyears_tf, aes(x=X1, y=X2, label=labels, color = labels)) +
          geom_text(family="Source Code Pro Semibold", alpha=0.8, size=3) +
          theme_void(base_family = "Source Sans Pro", base_size=8) +
          scale_color_viridis(discrete=T, guide=FALSE) + 
          #scale_x_continuous(limits=c(-xscale, xscale)) +
          #scale_y_continuous(limits=c(-yscale, yscale)) +
          labs(title = "2D Projection of Day-of-Year Vectors in Reddit Prediction Model",
               subtitle = "Labels closer to each other are more similar in context.",
               caption = "Max Woolf — minimaxir.com") +
          theme(plot.margin = unit(c(0.2,0.2,0.2,0.2),"cm"),
                plot.caption = element_text(size=6, color="#969696"))
          
ggsave("predict-reddit-3.png", plot, width=4, height=3)
```

![](predict-reddit-3.png)

## Hours

```{r}
perplexity = 3
initial_dims = 64
max_iter = 500

set.seed(123)

labels <- c("12AM", paste0(1:11,"AM"), "12PM", paste0(1:11,"PM"))

df_hours_tf <- df_hours %>% select(X2:X65) %>%
                  data.matrix() %>%
                  tsne(perplexity = perplexity, initial_dims = initial_dims, max_iter = max_iter)

df_hours_tf <- data.frame(labels, df_hours_tf) %>%
                    tbl_df() %>%
                    mutate(labels = factor(labels, levels=labels))

df_hours_tf
```


```{r}
xscale = 200
yscale = 200

plot <- ggplot(df_hours_tf, aes(x=X1, y=X2, label=labels, color = labels)) +
          geom_text(family="Source Code Pro Semibold", size=4) +
          theme_void(base_family = "Source Sans Pro", base_size=8) +
          scale_color_viridis(discrete=T, guide=FALSE) + 
          #scale_x_continuous(limits=c(-xscale, xscale)) +
          #scale_y_continuous(limits=c(-yscale, yscale)) +
          labs(title = "2D Projection of Hour (EST) Vectors in Reddit Prediction Model",
               subtitle = "Labels closer to each other are more similar in context.",
               caption = "Max Woolf — minimaxir.com") +
          theme(plot.margin = unit(c(0.2,0.2,0.2,0.2),"cm"),
                plot.caption = element_text(size=6, color="#969696"))
          
ggsave("predict-reddit-4.png", plot, width=4, height=3)
```

![](predict-reddit-4.png)

## Minutes

```{r}
perplexity = 10
initial_dims = 32
max_iter = 500

set.seed(123)

labels <- sprintf("%02d", 0:59) # zero-padded minute labels: "00" to "59"

df_minutes_tf <- df_minutes %>% select(X2:X65) %>%
                  data.matrix() %>%
                  tsne(perplexity = perplexity, initial_dims = initial_dims, max_iter = max_iter)

df_minutes_tf <- data.frame(labels, df_minutes_tf) %>%
                    tbl_df() %>%
                    mutate(labels = factor(labels, levels=labels))

df_minutes_tf
```


```{r}
xscale = 200
yscale = 200

plot <- ggplot(df_minutes_tf, aes(x=X1, y=X2, label=labels, color = labels)) +
          geom_text(family="Source Code Pro Semibold", size=4) +
          theme_void(base_family = "Source Sans Pro", base_size=8) +
          scale_color_viridis(discrete=T, guide=FALSE) + 
          #scale_x_continuous(limits=c(-xscale, xscale)) +
          #scale_y_continuous(limits=c(-yscale, yscale)) +
          labs(title = "2D Projection of Minute Vectors in Reddit Prediction Model",
               subtitle = "Labels closer to each other are more similar in context.",
               caption = "Max Woolf — minimaxir.com") +
          theme(plot.margin = unit(c(0.2,0.2,0.2,0.2),"cm"),
                plot.caption = element_text(size=6, color="#969696"))
          
ggsave("predict-reddit-5.png", plot, width=4, height=3)
```


![](predict-reddit-5.png)