This R Notebook is the complement to my blog post, "Predicting the Success of a Reddit Submission with Deep Learning and Keras."
This notebook is licensed under the MIT License. If you use the code or data visualization designs contained within this notebook, it would be greatly appreciated if proper attribution is given back to this notebook and/or myself. Thanks! :)
Load data.
# Training log written during model fitting (read as a regular CSV).
df_training <- read_csv("training.csv")

# Embedding tables: space-delimited text files without a header row, so readr
# assigns default column names. `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned, while `FALSE` is a reserved word.
# NOTE(review): the em dash quote character presumably keeps ordinary quote
# marks in the files from being treated as quoting -- confirm against the data.
df_dayofyears <- read_delim(
  "dayofyears_embeddings.txt",
  col_names = FALSE, delim = " ", quote = "—"
)
df_hours <- read_delim(
  "hours_embeddings.txt",
  col_names = FALSE, delim = " ", quote = "—"
)
df_minutes <- read_delim(
  "minutes_embeddings.txt",
  col_names = FALSE, delim = " ", quote = "—"
)

# Test Accuracy must be better than this.
# Hard-coded baseline value; it is not recomputed from the data in this chunk.
no_information_rate <- 0.64137998126
Tidy the data frame to plot training accuracy vs. test accuracy.
# Long-format copy of the training log: the two accuracy columns are stacked
# so that `type` holds the source column name and `perc` holds its value.
# `epoch` is shifted up by one before reshaping.
# NOTE(review): the shift presumably converts 0-based epochs to 1-based --
# confirm against training.csv.
df_training_tf <- gather(
  mutate(df_training, epoch = epoch + 1),
  key = "type",
  value = "perc",
  main_out_acc, val_main_out_acc
)
# Line chart of accuracy per epoch, one coloured series per value of `type`,
# with the no-information rate drawn as a dashed horizontal reference line.
plot <- ggplot(
  data = df_training_tf,
  mapping = aes(x = epoch, y = perc, color = type)
) +
  # Series: a line per type, overlaid with small white-filled circle markers.
  geom_line() +
  geom_point(shape = 21, fill = "white", size = 1) +
  geom_hline(linetype = "dashed", yintercept = no_information_rate) +
  # Axes: one tick per epoch from 1 to 20; y shown as percentages on a fixed
  # 60%-75% window.
  scale_x_continuous(breaks = 1:20) +
  scale_y_continuous(limits = c(0.60, 0.75), labels = percent) +
  # Colours and labels are matched to the `type` values by position.
  # NOTE(review): this relies on "main_out_acc" being ordered before
  # "val_main_out_acc" on the discrete scale -- confirm if columns are renamed.
  scale_color_manual(
    labels = c("Training Data (80%)", "Test Data (20%)"),
    values = c("#2980b9", "#27ae60")
  ) +
  labs(
    x = "# Epoch",
    y = "Accuracy of Model on Dataset",
    color = "",
    title = "Accuracy of Reddit Predictive Model During Training",
    caption = "Max Woolf —"
  ) +
  theme_minimal(base_family = "Source Sans Pro", base_size = 9) +
  theme(
    # Text styling.
    plot.title = element_text(family = "Source Sans Pro Bold", size = 11),
    plot.caption = element_text(color = "#969696", size = 6),
    axis.title.x = element_text(family = "Source Sans Pro Semibold"),
    axis.title.y = element_text(family = "Source Sans Pro Semibold"),
    # Legend sits above the panel; negative margins tighten the spacing.
    legend.position = "top",
    legend.margin = margin(t = -0.1, b = -0.25, unit = "cm"),
    # Layout.
    panel.grid.minor = element_blank()
  )

# Write the finished chart to disk.
ggsave(filename = "predict-reddit-1.png", plot = plot, width = 4, height = 3)