#########################
#########################
# Introduction to R for Data Manipulation and Analysis
# Josh McCrain | Emory University
# josh.mccrain@emory.edu | joshuamccrain.com
# Prepared for presentation at Georgia State University, October 6 2017
# I thank Steven W. Webster for his assistance (stevenwwebster.com)
#########################
#########################
############################
### 1. Introduction to R ###
############################
### Running code through the console ###
# Each line below is an expression; running it prints the result to the console.
3 + 5 # addition
3 * 5 # multiplication
log(50) # log() is the natural logarithm unless a base is supplied
exp(5) # e raised to the power 5
exp(log(50)) # exp() undoes log(), so this gives back 50
round(1.9) # rounds to the nearest whole number: 2
### Assigning variables & objects ###
# `<-` is the assignment operator: it stores the value on its right under the name on its left.
v1 <- 5
v2 <- 3
v1 # typing an object's name prints its value
v2
v1 - v2 # objects can be used anywhere their values could be
v <- v1 - v2 # the result of an expression can itself be stored in a new object
v
### Logical operators ###
# Every comparison below evaluates to TRUE or FALSE.
10 > 11
v1 > v2 # compares the objects v1 and v2 assigned earlier in this script
3 == 5 # Note: 2 equal signs, NOT 1!
3 != 5
!(3 == 5) # Same as above
(3 == 5) | (3 < 5) # OR
(3 == 5) & (3 < 5) # AND
# if() runs the code inside the braces only when its condition is TRUE.
if(v1 > v2) {
v3 <- v1 * v2
v3 # last expression in the braces, so its value is printed
}
### Vectors ###
# c() combines individual values into a single vector.
vec1 <- c(2, 3, 4, 10, 11)
vec1
# Arithmetic on a vector is applied to every element at once:
vec1 + 10
vec1 ^ 2
vec1 * 10
# Functions such as exp() also work element by element, returning a vector of the same length.
vec2 <- exp(vec1)
vec2
### Functions ###
# my_function: add two inputs together.
#   param1, param2: numbers, or numeric vectors
#   returns: param1 + param2 (computed element by element when given vectors)
my_function <- function(param1, param2){
  # a function returns the value of the last expression it evaluates
  param1 + param2
}
# Call the function with two single numbers and store what it returns.
r <- my_function(5, 10)
r
# this function will also take a vector
# (vec1 and vec2 have the same length, so they are added element by element)
r <- my_function(vec1, vec2)
r
### Working with vectors ###
# R is an object oriented programming language
# What does that mean?
seq1 <- seq(50, 60, by = .5) #seq() creates sequences of numbers
seq1
length(seq1) # number of elements in the vector
# Square brackets pull elements out of a vector; positions start at 1.
seq1[1] # first element
seq1[5] # fifth element
seq1[5:10] # elements 5 through 10
seq1[seq1 < 55] # only the elements for which the condition is TRUE
# It means you can reference individual aspects of an object, this becomes very useful later on
# Vectors can also consist of strings
movies <- c("The Force Awakens", "Rogue One", "A New Hope", "The Empire Strikes Back", "Return of the Jedi")
movies[1]
### Loops ###
# Loops are not generally necessary in R, and they get to be really slow with large data.
# However, there are some useful applications (this is not one of them)
# Print every movie title prefixed with its position in the vector ("1. ...", "2. ...").
# seq_along() gives one index per element and, unlike 1:length(movies), gives no
# indices at all for an empty vector (1:0 would count down through 1 and 0).
for(i in seq_along(movies)){
  movies2 <- paste0(i, ". ", movies[i]) # paste0() joins its arguments with no separator
  print(movies2)
}
### Lists ###
# Lists are a useful way to deal with data when you have multiple formats
# One number per title in `movies`, in the same order.
box_office <- c(936.7, 523.2, 307.3, 209.4, 252.6)
str(box_office) #str() gives you basic information about an object, such as its type (num/chr/etc)
str(movies)
starwars <- list() #create a list object
# Double square brackets [[ ]] refer to a single element of a list.
starwars[[1]] <- movies
starwars[[2]] <- box_office
names(starwars) <- c("Movie", "Box_Office") # I can change the name of list elements
starwars
str(starwars)
# [[1]] picks the first list element (the movie vector); the trailing [1] picks that vector's first value.
starwars[[1]][1] # Note how the indexing works with lists
starwars[[2]][1] # This is not the most intuitive thing in my opinion
starwars$Movie # and I can reference elements of the object through the name with the $ (dollar sign)
starwars$Box_Office # the $ is a very important operator in R
str(starwars$Movie) # this is a vector which is stored as an element in a list, so I can treat it as a vector
starwars$Movie[2:4]
# Both vectors are extended by one value here, so they still have the same length afterwards.
starwars[[1]] <- c(starwars[[1]], "The Phantom Menace") # I can also add new elements to a list (or a vector)
starwars[[2]] <- c(starwars[[2]], 431.1) # c() stands for concatenate
starwars
### Dataframes ###
# dataframes are the most versatile data format in R
# and it's easy to format existing data as a dataframe
# Each argument to data.frame() becomes one column; here the two list elements become
# the columns `movie` and `box_office`.
starwars_df <- data.frame(movie = starwars$Movie, box_office = starwars$Box_Office)
starwars_df
View(starwars_df) # opens the dataframe in a spreadsheet-style viewer (e.g. in RStudio)
# The strength of dataframes comes from their ability to deal with very large data
# let's generate some new data
# I'm going to randomly draw numbers from a poisson distribution
# 1000 random draws from a Poisson distribution with mean 3.5.
# No seed is set, so the exact numbers differ every time this is run.
newdata <- rpois(1000, 3.5)
# Aside: in R you can quickly look up functions by placing a ? in front of it, e.g.:
?rpois
?seq
# rep(..., each=10) repeats every ID from 1 to 100 ten times in a row, giving 1000 rows:
# 10 observations for each of 100 "people".
df <- data.frame(person = rep(seq(1, 100), each=10), fake_data = newdata)
head(df, n = 20) # this should look familiar as panel data
df$person
mean(df$fake_data) #pretty close to the mean from the rpois() call above
sd(df$fake_data) #standard deviation
var(df$fake_data) #variance -- hey it's pretty close to the mean!
plot(density(df$fake_data)) # more on visualization later
hist(df$fake_data) # histogram of the same column
# you can reference individual columns or rows by their index number
# Inside the brackets the part before the comma selects rows and the part after selects columns;
# leaving one side empty means "all of them".
df[10,] #row 10
df[,1] #column 1
df$fake_data[5] #or individual cells
# or I can reference individual rows by conditional statements:
df[df$person < 10, ] # this gets all observations for individuals whose ID is less than 10
# I can also manipulate individual values based on conditional statements:
df$fake_data[df$fake_data < 2] <- df$fake_data[df$fake_data < 2] + 1
# this 1) takes all rows where the value of 'fake_data' is less than 2 and then
# 2) it adds 1 to those rows
mean(df$fake_data) #the mean is now higher
# I can also rename columns
# (the new names are applied in column order: person -> subject, fake_data -> data)
names(df) <- c("subject", "data")
head(df)
# The key intuitive takeaway with dataframes is to think of each column as an individual vector.
# In the next session I cover more complex, realistic uses of dataframes.
# For example, what if I want to create a new variable (column) conditional on the value in an existing column?
# What if I want to deal with groups of rows based on the value of a variable?
# cleaning the workspace
# rm() deletes the named object from the workspace.
rm(df)
rm(starwars_df)
# ls() lists every object currently in the workspace, so this removes all of them at once.
# Everything created above is gone after this line.
rm(list=ls()) #clears the entire workspace
############################
### 2. Data Manipulation ###
############################
# R's strength is in its packages, which are add-ons written by the open source community.
# The most useful for data manipulation are written by Hadley Wickham in what is known as the tidyverse.
# Here we'll install two of these packages and then load in some real data to work with.
# you only need to run the install.packages code once per package per computer
# Install a package only when it is missing, so that re-running this script does not
# download and reinstall packages that are already on this computer.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr) # you must call this every time you start R and want to use a package
# this is the most versatile package for working with strings and regular expressions
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
library(stringr)
# set your working directory and load in data
# you can set your working directory manually or in RStudio:
# Session > Set Working Directory > To Source File Location (or choose another location)
# The folder below is specific to one computer. setwd() stops with an error when the
# folder does not exist, so only change directory when it is actually there.
teaching_dir <- "C:/Users/Josh/Dropbox/r teaching"
if (dir.exists(teaching_dir)) {
  setwd(teaching_dir)
}
# read in replication data from Gary Jacobson on congressional elections
# (read straight from the web address, so it does not depend on the working directory)
elections <- read.csv("http://joshuamccrain.com/elections_data.csv")
# You can also save your workspace:
# save.image("C:/Users/Josh/Dropbox/r teaching/workspace.RData")
### dplyr and pipes ###
# dplyr is the best framework within R for complex data manipulation
# but it uses some new notation called "pipes".
# common functions in dplyr:
# filter()
# mutate()
# group_by()
# summarize()
# select()
# left_join()
# arrange()
# what if I only want to look at one election cycle? two identical approaches:
# filter() keeps only the rows for which the condition is TRUE.
# Both lines produce the same dataframe; the second simply overwrites the first.
cycle.2002 <- filter(elections, cycle == 2002) # approach 1
cycle.2002 <- elections %>% filter(cycle == 2002) # approach 2
# the pipes are key here and are simple once you understand the intuition.
# for example, these three are the same:
# 1) f(x, y)
# 2) x %>% f(y)
# 3) x %>% f(., y)
# when using pipes, the pipe passes the "piped in" value as the leftmost (first) argument of the function
# if I don't want it to be the leftmost argument, I can use the '.' instead:
# f(x, y) is the same as y %>% f(x, .)
# dplyr functions are written to make this easier
## more practical examples
# taking a random sample (useful in bootstrapping, for example):
?sample_n
# Two equivalent ways to draw 100 random rows. Each call draws a fresh random sample,
# so the second line replaces the first result with a different set of rows.
sample <- elections %>% sample_n(100)
sample <- sample_n(elections, 100)
# what if I only want to keep a few variables?
# select() keeps just the named columns (again written both with and without the pipe).
trimmed <- elections %>% select(CRPFilerID_R, cycle, party)
trimmed <- select(elections, CRPFilerID_R, cycle, party)
# Or create new variables:
# mutate() adds the new column to the dataframe; assigning back to `elections` keeps it.
elections <- elections %>% mutate(dem_margin = share_D - share_R) #create the margin of victory
# Summarize is very useful for creating interesting summary statistics
# What if I want to know how many observations there are per congressional district?
# This will take two steps: first, filter out Senate elections, then summarize the resulting dataframe
# grepl("SE", x) is TRUE where x contains the text "SE"; the leading ! flips that,
# so filter() keeps the rows whose distIDRunFor does NOT contain "SE".
house_elections <- elections %>% filter(!grepl("SE", distIDRunFor))
# grepl() is a useful function for checking content of strings
?grepl
# Now let's summarize()
# n() counts the rows in each group, giving one row per distIDRunFor value.
election_count <- house_elections %>%
group_by(distIDRunFor) %>%
summarize(num_elections = n())
View(election_count)
# group_by() groups the data by the variable of interest, and then does the subsequent operations *within* each group, in this case summarize()
# notice there is now one observation per congressional district, and all other variables have been removed
# summarize condenses the data down so make sure to save the result as a new dataframe
# you can calculate multiple summary statistics at once with this method
# here we'll get some interesting information about Republican expenditures per election year
# we also must remove observations with NAs
# Mean, maximum, minimum and standard deviation of Republican expenditures for each
# election cycle. The result is not saved, so it is printed to the console.
# na.rm = TRUE drops missing values before each statistic is calculated. It is spelled
# out as TRUE because T is an ordinary variable that can be overwritten.
house_elections %>%
  group_by(cycle) %>%
  summarize(mean = mean(Republican_Expenditure_Jacobson, na.rm = TRUE),
            max = max(Republican_Expenditure_Jacobson, na.rm = TRUE),
            min = min(Republican_Expenditure_Jacobson, na.rm = TRUE),
            sd = sd(Republican_Expenditure_Jacobson, na.rm = TRUE)) # add %>% View() to open it in the viewer
# Now let's create a new variable for the state of each election
# We can do this by manipulating the distIDRunFor variable and string manipulation from the stringr package
# str_sub(x, 1, 2) extracts characters 1 through 2 of each string.
# presumably distIDRunFor begins with the two-letter state abbreviation -- check with head() below
house_elections <- house_elections %>% mutate(state = str_sub(distIDRunFor, 1, 2))
head(house_elections$state, n=20)
# having information by state could be useful for a number of reasons:
# for instance, data visualization, or state fixed-effects in a panel data model.
# let's calculate a summary statistic by state (average democratic vote by year), then add it back to the main dataframe.
# (this could also be useful as a control variable in a model)
# Average Democratic vote share for every state/cycle combination.
# Result: one row per state and cycle with the columns state, cycle and mean_dem_vote.
state_dem_vote <- house_elections %>%
  group_by(state, cycle) %>% # notice TWO group_by variables,
  # so the summarize is done within each state/cycle.
  # na.rm = TRUE must go INSIDE mean(): written as an argument to summarize() it does not
  # remove missing values, it just creates an extra column called na.rm.
  summarize(mean_dem_vote = mean(share_D, na.rm = TRUE)) %>%
  ungroup() # summarize() leaves the result grouped by state; remove that grouping
### left_join() ###
# This is useful to have, but since summarize reduces the data how do we add it back to our main dataframe?
# the answer is left_join().
# left_join() combines two dataframes based on a column (or columns) with unique identifiers
# here's a toy example:
# Two small dataframes that share a `year` column (2000 through 2005).
toydf1 <- data.frame(year = seq(2000, 2005, 1), person = paste(letters[1:6]))
toydf1
# rnorm(6, 50000, 10000): six random draws with mean 50000 and standard deviation 10000.
toydf2 <- data.frame(year = seq(2000, 2005, 1), salary = rnorm(6, 50000, 10000))
toydf2
# Keep every row of toydf1 and attach the columns of toydf2 from the row with the same year.
toydf <- left_join(toydf1, toydf2, by = "year")
toydf
# there are other join functions: full_join(), anti_join(), inner_join()
# more info is here: https://github.com/rstudio/cheatsheets/raw/master/data-transformation.pdf
## Now let's use this function on our real data, combining state-level information to the individual data
# Every row of house_elections is kept; the columns of state_dem_vote are attached from
# the row whose state AND cycle both match.
house_elections <- left_join(house_elections, state_dem_vote, by = c("state", "cycle"))
# notice here I am joining based on the value in TWO columns, state and cycle.
# this is incredibly useful in practical data, especially if you work with panel data.
# for example, imagine your unit of observation is a country-year.
### additional examples ###
# let's create some new variables conditional on the value in an existing column.
# this takes advantage of the very useful ifelse() function which uses this syntax:
# ifelse(condition, do if true, do if false)
# The result is saved as a NEW dataframe (close_elections); house_elections is left unchanged.
# abs() takes the absolute value, so a narrow win by either party counts as close.
# NOTE(review): the cutoff of 5 presumably means 5 percentage points -- confirm the scale of share_D/share_R.
# The second variable can use close_election because mutate() creates its variables in order.
close_elections <- house_elections %>%
mutate(close_election = ifelse(abs(dem_margin) < 5, 1, 0), #if the election is close, create a dummy variable = 1
dem_close_election = ifelse(close_election == 1 & dem_margin > 0, 1, 0)) #if the election is close AND the democrat won
# here we get more information by expenditures
# Add each state/cycle's average Democratic and Republican expenditure to every row.
# Because this uses mutate() rather than summarize(), no rows are dropped: every row in
# the same state and cycle simply receives the same average.
house_elections <- house_elections %>%
  ungroup() %>% # clear any grouping left over from earlier steps
  group_by(state, cycle) %>%
  # na.rm = TRUE drops missing values first (TRUE is spelled out because T can be overwritten)
  mutate(avg_dem_exp = mean(Democrat_Expenditure_Jacobson, na.rm = TRUE),
         avg_rep_exp = mean(Republican_Expenditure_Jacobson, na.rm = TRUE)) %>%
  ungroup() # do not leave the saved dataframe grouped for whatever comes next
# removing grouping information [ungroup()] is often a good idea when doing new group_by operations
#############################
### 3. Data Visualization ###
#############################
# Most of what you see in popular media data visualization is done with R and ggplot2.
# (e.g., The Economist, FiveThirtyEight, NYT Upshot)
# Install ggplot2 only when it is missing, so that re-running this script does not
# download and reinstall it every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2) # load the package for this session
# let's keep working with house elections data
# ggplot works by creating layers of visualization that are then added onto.
# let's start by plotting some summary statistics.
# first: average democratic vote share by cycle
# ggplot() sets up the data and aes() says which columns go on which axis; nothing is drawn yet.
plot <- ggplot(house_elections, aes(x = cycle, y = share_D))
plot # a blank plot -- let's add to it
# Layers are added with `+`. Because the result is not assigned, `plot` itself stays unchanged.
plot + geom_smooth(method="lm") # add a line to the plot
# NOTE(review): newer ggplot2 releases warn that `size` for lines is deprecated in favor of `linewidth`.
plot + stat_smooth(method = "lm", formula = y ~ x + I(x^2), size = 1) # add a quadratic-term line
# some summary statistics of interest:
# Density of (logged) Democratic expenditures.
ggplot(house_elections, aes(x=log(Democrat_Expenditure_Jacobson))) + geom_density()
# Histogram of the absolute margin with a density curve drawn on top.
# y=..density.. puts the histogram on the density scale so both layers share one y-axis.
# NOTE(review): newer ggplot2 releases deprecate the ..density.. notation in favor of after_stat(density).
ggplot(house_elections, aes(x=abs(dem_margin))) + geom_histogram(aes(y=..density..), binwidth=1) +
geom_density(alpha=.2, fill="#FF6666")
# something more interesting: the relationship between vote share and (logged) expenditures
# Scatterplot of Democratic vote share against (logged) Democratic expenditures.
plot2 <- ggplot(house_elections, aes(x = log(Democrat_Expenditure_Jacobson), y = share_D))
# This time the layers are saved back into plot2: hollow, partly transparent points plus a linear fit.
plot2 <- plot2 + geom_point(shape = 1, alpha = .7) + geom_smooth(method="lm")
plot2
# let's change the aesthetics:
# themes can be found here: http://ggplot2.tidyverse.org/reference/ggtheme.html
# Same scatterplot as plot2, with the points colored by vote share, a red fitted line,
# a lighter theme and readable axis labels.
ggplot(house_elections, aes(x = log(Democrat_Expenditure_Jacobson), y = share_D, color = share_D)) +
geom_point(alpha = .7) +
geom_smooth(method="lm", color="red") +
theme_light() +
xlab("(logged) Dem. Expenditures") + # x-axis label
ylab("Dem. Vote Share") # y-axis label
# now let's look at dem vote share broken down by state over time:
# Democratic vote share over time, drawn as one small panel per state.
ggplot(house_elections, aes(x = cycle, y = share_D)) +
  geom_point(alpha = .5) +
  # se = FALSE hides the shaded confidence band around the fitted line
  # (FALSE is spelled out because F is an ordinary variable that can be overwritten)
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~state) + # one panel for every value of `state`
  theme_light()
# facet_wrap is very useful, for example from previously:
# The expenditure/vote-share scatterplot from before, now split into one panel per election cycle.
ggplot(house_elections, aes(x = log(Democrat_Expenditure_Jacobson), y = share_D, color = share_D)) +
geom_point(alpha = .7) +
geom_smooth(method="lm", color="red") +
facet_wrap(~cycle) + # one panel for every value of `cycle`
theme_light() +
xlab("(logged) Dem. Expenditures") +
ylab("Dem. Vote Share")
# finally, you can change colors based on groups within the data
# The dataframe is piped straight into ggplot(), so the new `winner` column exists only
# for this plot and is not saved in house_elections.
# Rows with dem_margin greater than 0 are labelled "Democrat"; all other non-missing rows
# (including an exact tie at 0) are labelled "Republican".
house_elections %>%
mutate(winner = ifelse(dem_margin > 0, "Democrat", "Republican")) %>% #creating a categorical variable based on who won the election
ggplot(aes(x = log(Democrat_Expenditure_Jacobson), fill=winner)) +
geom_histogram(binwidth=.5, position="dodge") # "dodge" draws the groups' bars side by side
# a complex example combining dplyr piping and ggplot2
# visualizing the regression discontinuity
# this example comes from code by Gregory Martin (http://polisci.emory.edu/faculty/gjmart2/)
# Plots the Democratic margin in one election (x) against the Democratic margin in the
# same district's next election in the data (y), for open-seat races only.
house_elections %>%
# incumbent_status is the difference of the two incumbency columns;
# recode() turns its values -1 / 0 / 1 into readable labels.
# log(1 + x) is used so that an expenditure of 0 does not produce log(0).
mutate(incumbent_status = incumbent_D - incumbent_R,
log_exp_diff_dem = log(1+Democrat_Expenditure_Jacobson) - log(1+Republican_Expenditure_Jacobson),
race_type = recode(incumbent_status, `-1`="R Inc", `0`="Open Seat", `1` = "D Inc")) %>%
# Sort by district and cycle so that within a district "the next row" is the next election in the data.
arrange(distIDRunFor, cycle) %>%
group_by(distIDRunFor) %>%
# lead() takes the value from the following row; because of group_by() it never reaches
# into another district (the last election of each district gets NA).
mutate(next_dem_margin = lead(dem_margin),
dem_os_winner = dem_margin > 0) %>%
# The filter comes AFTER lead() so that next_dem_margin is taken from all races, not only open seats.
filter(race_type=="Open Seat") %>%
# group=dem_os_winner makes geom_smooth() fit separate curves on each side of a margin of 0.
ggplot(aes(x=dem_margin, y=next_dem_margin, group=dem_os_winner)) +
geom_point(alpha=.8) +
geom_smooth(method="lm", formula = y ~ poly(x,3)) + # cubic polynomial fit
theme_light()