This R Notebook is the complement to my blog post A Visual Overview of Stack Overflow’s Question Tags.
This notebook is licensed under the MIT License. If you use the code or data visualization designs contained within this notebook, it would be greatly appreciated if proper attribution is given back to this notebook and/or myself. Thanks! :)
library(tidyverse)
[30m── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mggplot2[30m 2.2.1.[31m9000[30m [32m✔[30m [34mpurrr [30m 0.2.4
[32m✔[30m [34mtibble [30m 1.4.2 [32m✔[30m [34mdplyr [30m 0.7.4
[32m✔[30m [34mtidyr [30m 0.8.0 [32m✔[30m [34mstringr[30m 1.2.0
[32m✔[30m [34mreadr [30m 1.1.1 [32m✔[30m [34mforcats[30m 0.2.0 [39m
package ‘tibble’ was built under R version 3.4.3package ‘tidyr’ was built under R version 3.4.3[30m── [1mConflicts[22m ────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()[39m
library(lubridate)
Attaching package: ‘lubridate’
The following object is masked from ‘package:base’:
date
library(tidytext) # created at Stack Overflow by Julia Silge and David Robinson
library(scales)
Attaching package: ‘scales’
The following object is masked from ‘package:purrr’:
discard
The following object is masked from ‘package:readr’:
col_factor
library(viridis)
Loading required package: viridisLite
Attaching package: ‘viridis’
The following object is masked from ‘package:viridisLite’:
viridis.map
The following object is masked from ‘package:scales’:
viridis_pal
library(ggrepel)
library(ggridges)
sessionInfo()
R version 3.4.2 (2017-09-28)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.3
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggridges_0.4.1 ggrepel_0.7.0 viridis_0.4.1 viridisLite_0.3.0 scales_0.5.0
[6] tidytext_0.1.6 lubridate_1.7.1 forcats_0.2.0 stringr_1.2.0 dplyr_0.7.4
[11] purrr_0.2.4 readr_1.1.1 tidyr_0.8.0 tibble_1.4.2 ggplot2_2.2.1.9000
[16] tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] reshape2_1.4.3 haven_1.1.1 lattice_0.20-35 colorspace_1.3-2 SnowballC_0.5.1
[6] yaml_2.1.16 rlang_0.1.6 pillar_1.1.0 foreign_0.8-69 glue_1.2.0
[11] modelr_0.1.1 readxl_1.0.0 bindrcpp_0.2 bindr_0.1 plyr_1.8.4
[16] munsell_0.4.3 gtable_0.2.0 cellranger_1.1.0 rvest_0.3.2 psych_1.7.8
[21] knitr_1.19 parallel_3.4.2 broom_0.4.3 tokenizers_0.1.4 Rcpp_0.12.15
[26] jsonlite_1.5 gridExtra_2.3 mnormt_1.5-5 hms_0.4.1 stringi_1.1.6
[31] grid_3.4.2 cli_1.0.0 tools_3.4.2 magrittr_1.5 lazyeval_0.2.1
[36] janeaustenr_0.1.5 crayon_1.3.4 pkgconfig_2.0.1 Matrix_1.2-12 xml2_1.2.0
[41] assertthat_0.2.0 httr_1.3.1 rstudioapi_0.7 R6_2.2.2 nlme_3.1-131
[46] compiler_3.4.2
# Pin the session timezone to Pacific so date math is reproducible.
Sys.setenv(TZ = "America/Los_Angeles")

# Stack Overflow brand orange — https://brandcolors.net/b/stackoverflow
stack_overflow_color <- "#f48024"

# Muted gray shared by subtitles and captions.
muted_gray <- "#969696"

# Global ggplot2 theme: minimal base with small Source Sans Pro text.
base_theme <- theme_minimal(base_size = 9, base_family = "Source Sans Pro")
theme_overrides <- theme(
  plot.title = element_text(size = 8, family = "Source Sans Pro Bold",
                            margin = margin(t = -0.1, b = 0.1, unit = "cm")),
  axis.title.x = element_text(size = 8),
  axis.title.y = element_text(size = 8),
  plot.subtitle = element_text(family = "Source Sans Pro Semibold",
                               color = muted_gray, size = 6),
  plot.caption = element_text(size = 6, color = muted_gray),
  legend.text = element_text(size = 6),
  legend.key.width = unit(0.25, unit = "cm")
)
theme_set(base_theme + theme_overrides)
Use data precomputed from this BigQuery:
#standardSQL
-- Total 2017 view-count growth, grouped by the year each question was asked.
-- view_count_delta = views gained by a question between the 2017-03 and
-- 2017-12 Stack Overflow archive snapshots.
SELECT
DATE_TRUNC(DATE(creation_date), YEAR) AS year,
SUM(view_count_delta) AS total_delta
FROM (
-- Per-question delta: December snapshot views minus March snapshot views.
SELECT
id,
creation_date,
b.view_count - a.view_count AS view_count_delta
FROM
`fh-bigquery.stackoverflow_archive.201703_posts_questions` a
LEFT JOIN (
SELECT
id,
view_count
FROM
`fh-bigquery.stackoverflow_archive.201712_posts_questions` ) b
USING
(id) )
GROUP BY
year
ORDER BY
year ASC
Load in the precomputed data.
# Precomputed yearly view-count deltas from the BigQuery above.
file_path <- "stack_overflow_delta.csv"

# Load the deltas and compute each year's share of all views.
# total_delta parses as integer, so cast to double before summing
# to keep the grand total from overflowing.
df_deltas <- file_path %>%
  read_csv() %>%
  mutate(perc = total_delta / sum(as.numeric(total_delta)))
Parsed with column specification:
cols(
year = col_date(format = ""),
total_delta = col_integer()
)
df_deltas
Overview of 2017 view counts on older posts.
# Share of 2017 pageviews received by questions from each prior year.
# Filter to 2009-2016 so partial years (2008, 2017) don't distort the bars.
plot <- ggplot(df_deltas %>% filter(year >= ymd('2009-01-01'), year <= ymd('2016-01-01')), aes(x=year, y=perc)) +
  # geom_col() is the idiomatic replacement for geom_bar(stat = "identity").
  geom_col(alpha=0.9, fill=stack_overflow_color) +
  scale_x_date(date_breaks='1 year', date_labels='%Y', minor_breaks = NULL) +
  scale_y_continuous(labels=percent) +
  labs(title='Proportion of 2017 Views on Older Stack Overflow Questions by Year',
       subtitle='From March 13th, 2017 to December 3rd, 2017. Visualization Excludes Partial Years',
       x='Year Question Was Posted',
       y='% of All Views',
       caption = "Max Woolf — minimaxir.com"
  )
ggsave('so_overview.png', plot, width=4, height=2)
Data processed from this BigQuery: (NB: to download large datasets, save as a BigQuery table and export as a CSV, then download the CSV)
#standardSQL
-- For every 2017 question, attach its first answer (f_* columns) and its
-- accepted answer (a_* columns), plus time-to-answer in seconds for each.
WITH
answers_ordered AS (
-- Rank each answer within its question: by arrival time (time_rank),
-- by score (score_rank), and count the answers per question.
SELECT
id,
creation_date,
parent_id AS question_id,
score,
ROW_NUMBER() OVER (PARTITION BY parent_id ORDER BY creation_date ASC) AS time_rank,
ROW_NUMBER() OVER (PARTITION BY parent_id ORDER BY score DESC) AS score_rank,
COUNT(*) OVER (PARTITION BY parent_id) AS num_answers
FROM
`fh-bigquery.stackoverflow_archive.201712_posts_answers` )
SELECT
id,
title,
tags,
DATETIME(creation_date) AS creation_date,
accepted_answer_id,
view_count,
score,
num_answers,
f_answer_id,
DATETIME(f_creation_date) AS f_creation_date,
f_score,
f_score_rank,
a_answer_id,
DATETIME(a_creation_date) AS a_creation_date,
a_score,
a_time_rank,
a_score_rank,
-- Seconds from question creation to first answer / accepted answer.
TIMESTAMP_DIFF(f_creation_date, creation_date, SECOND) AS time_to_f,
TIMESTAMP_DIFF(a_creation_date, creation_date, SECOND) AS time_to_a
FROM
`fh-bigquery.stackoverflow_archive.201712_posts_questions` q
-- f: the earliest answer on each question (time_rank = 1).
LEFT JOIN (
SELECT
id AS f_answer_id,
creation_date AS f_creation_date,
question_id AS f_question_id,
score AS f_score,
score_rank AS f_score_rank,
num_answers
FROM
answers_ordered
WHERE
time_rank = 1 ) f
ON
q.id = f.f_question_id
-- a: the accepted answer, matched via the question's accepted_answer_id.
LEFT JOIN (
SELECT
id AS a_answer_id,
creation_date AS a_creation_date,
score AS a_score,
time_rank AS a_time_rank,
score_rank AS a_score_rank
FROM
answers_ordered ) a
ON
q.accepted_answer_id = a.a_answer_id
-- Restrict to questions asked January through November 2017.
WHERE
creation_date >= '2017-01-01 00:00:00' AND creation_date < '2017-12-01 00:00:00'
# Question-level 2017 export from the BigQuery above (downloaded as CSV).
file_path <- "~/Downloads/stack_overflow_2017.csv"

# Suppress the progress bar; the file is large but loads in one shot.
df <- file_path %>% read_csv(progress = FALSE)
Parsed with column specification:
cols(
id = col_integer(),
title = col_character(),
tags = col_character(),
creation_date = col_datetime(format = ""),
accepted_answer_id = col_integer(),
view_count = col_integer(),
score = col_integer(),
num_answers = col_integer(),
f_answer_id = col_integer(),
f_creation_date = col_datetime(format = ""),
f_score = col_integer(),
f_score_rank = col_integer(),
a_answer_id = col_integer(),
a_creation_date = col_datetime(format = ""),
a_score = col_integer(),
a_time_rank = col_integer(),
a_score_rank = col_integer(),
time_to_f = col_integer(),
time_to_a = col_integer()
)
df %>% head()
Add columns describing when each post was made. The raw data is in UTC, so it must be converted to Eastern time.
# Derive posting-time features. Timestamps arrive in UTC; convert to
# US Eastern before extracting hour/weekday/week/month components.
df <- df %>%
  mutate(
    creation_date = with_tz(creation_date, "America/New_York"),
    hour_posted = hour(creation_date),
    weekday_posted = wday(creation_date, label = TRUE, abbr = FALSE),
    week_posted = floor_date(creation_date, "1 week"),
    month_posted = floor_date(creation_date, "1 month")
  )
Add a few bespoke features:
# Add derived indicator/count features.
df <- df %>% mutate(
  # Unanswered questions come back NA from the LEFT JOIN; treat as 0 answers.
  num_answers = ifelse(is.na(num_answers), 0, num_answers),
  # 1 if the first answer arrived within 4 hours (60*60*4 seconds) of posting.
  f_answer_in = ifelse(!is.na(time_to_f) & time_to_f < 60*60*4, 1, 0),
  # 1 if the accepted answer arrived within 4 hours of posting.
  a_answer_in = ifelse(!is.na(time_to_a) & time_to_a < 60*60*4, 1, 0),
  # Tags are pipe-delimited, so tag count = delimiter count + 1.
  # str_count() is the direct vectorized idiom, replacing the
  # str_match_all %>% lapply(length) %>% unlist() round trip (which also
  # relied on %>% binding tighter than +, and miscounted NA tags as 2).
  num_tags = str_count(tags, fixed("|")) + 1,
  # A question is "answered" iff it has an accepted answer.
  is_answered = ifelse(!is.na(time_to_a), 1, 0)
)
For all questions asked:
# Weekly volume of new questions across 2017.
# geom_bar's default stat is "count", so mapping y = ..count.. explicitly
# is unnecessary — the bar heights are identical either way.
plot <- ggplot(df, aes(x = week_posted)) +
  geom_bar(fill = stack_overflow_color, alpha = 0.9) +
  scale_x_datetime(date_breaks = '2 months', date_labels = '%b') +
  scale_y_continuous(labels = comma) +
  labs(
    title = 'New Stack Overflow Questions in 2017',
    x = 'Week Question was Posted',
    y = '# of Questions Posted',
    caption = "Max Woolf — minimaxir.com"
  )

ggsave('weekly_count.png', plot, width = 4, height = 2)