Download the full code here: https://joshuamccrain.com/tutorials/rvest/scraping.Rmd
library(tidyverse)
# Two small values for demonstrating an ordinary function call
# before the pipe is introduced.
x <- 2
y <- 5
sum(x, y)
## [1] 7
# The pipe passes `x` as the first argument, so this is sum(x, y).
x %>% sum(y)
## [1] 7
# Extract the surname from a formatted member name, one named step at a time.
ex <- "Rep. PRICE, David (NC)"
ex2 <- word(ex, 2)                            # second word of the string
ex3 <- str_replace_all(ex2, "[:punct:]", "")  # strip punctuation
ex4 <- str_to_title(ex3)                      # convert to title case
ex4
## [1] "Price"
# The same three steps as one pipeline, with no intermediate variables.
ex %>%
  word(2) %>%
  str_replace_all("[:punct:]", "") %>%
  str_to_title()
## [1] "Price"
rvest
# https://www.opensecrets.org/federal-lobbying/top-spenders
# install.packages("rvest")
library(rvest)
# Text of every div element in the local example page, whitespace-trimmed.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
read_html("test.html") %>%
  html_nodes("div") %>%
  html_text(trim = TRUE)
## [1] "My First Heading\r\n My first paragraph."
## [2] "Some text\r\n\r\n something important \r\n something not important"
# identical to this clumsy thing:
# Step-by-step version: parse the page, select the div nodes, extract their text.
h <- read_html("test.html")
nodes <- html_nodes(h, "div")
html_text(nodes, trim = TRUE)
## [1] "My First Heading\r\n My first paragraph."
## [2] "Some text\r\n\r\n something important \r\n something not important"
# Select by id: only the element with id "div1".
read_html("test.html") %>%
  html_nodes("#div1") %>%
  html_text(trim = TRUE)
## [1] "My First Heading\r\n My first paragraph."
# Descendant selector: h1 elements inside the element with id "div1".
read_html("test.html") %>%
  html_nodes("#div1 h1") %>%
  html_text(trim = TRUE)
## [1] "My First Heading"
# Combine id and class: span elements with class "important" inside "#div2".
read_html("test.html") %>%
  html_nodes("#div2 span.important") %>%
  html_text(trim = TRUE)
## [1] "something important"
# Link text of every anchor inside the .datadisplay element of the
# revolving-door landing page.
html_text(
  html_nodes(
    read_html("https://www.opensecrets.org/revolving/"),
    ".datadisplay a"
  )
)
## [1] "Wallander, Celeste A" "Choudhry, Nusrat Jahan" "Hilliard, Earl"
## [4] "Thompson, Tola" "Whitney, Courtney"
# what about the links?
# The href attribute of every anchor inside the .datadisplay element.
html_attr(
  html_nodes(
    read_html("https://www.opensecrets.org/revolving/"),
    ".datadisplay a"
  ),
  "href"
)
## [1] "rev_summary.php?id=82874" "rev_summary.php?id=82872"
## [3] "rev_summary.php?id=82871" "rev_summary.php?id=82870"
## [5] "rev_summary.php?id=82868"
# a specific element on the page?
# One specific cell: first column of the third row inside "#latest".
html_text(
  html_nodes(
    read_html("https://www.opensecrets.org/revolving/"),
    "#latest tr:nth-child(3) td:nth-child(1)"
  )
)
## [1] "Hilliard, Earl"
Grabbing the whole table:
# Convert the whole "#latest" element into a data frame (returned inside a list).
html_table(
  html_nodes(
    read_html("https://www.opensecrets.org/revolving/"),
    "#latest"
  )
)
## [[1]]
## # A tibble: 5 x 2
## X1 X2
## <chr> <chr>
## 1 Wallander, Celeste A US Dept of Defense
## 2 Choudhry, Nusrat Jahan ACLU of Illinois
## 3 Hilliard, Earl Hilliard, Smith & Hunt
## 4 Thompson, Tola Ballard Partners
## 5 Whitney, Courtney Ballard Partners
Who are the top-spenders in lobbying by year? How many lobbyists do they hire? How many are revolvers?
# https://www.opensecrets.org/federal-lobbying/top-spenders
page <- "https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2021"

# Every table on the page, each converted to a data frame.
read_html(page) %>%
  html_nodes("table") %>%
  html_table()
## [[1]]
## # A tibble: 20 x 2
## `Lobbying Client` `Total Spent`
## <chr> <chr>
## 1 US Chamber of Commerce $66,390,000
## 2 National Assn of Realtors $44,004,025
## 3 Pharmaceutical Research & Manufacturers of America $30,377,000
## 4 Business Roundtable $29,120,000
## 5 Blue Cross/Blue Shield $25,176,385
## 6 American Hospital Assn $25,130,934
## 7 Amazon.com $20,590,000
## 8 Meta $20,070,000
## 9 American Medical Assn $19,490,000
## 10 American Chemistry Council $16,640,000
## 11 Raytheon Technologies $15,390,000
## 12 National Assn of Manufacturers $15,300,000
## 13 Lockheed Martin $14,401,911
## 14 NCTA The Internet & Television Assn $14,010,000
## 15 AARP $13,680,000
## 16 Boeing Co $13,450,000
## 17 Comcast Corp $13,380,000
## 18 Biotechnology Innovation Organization $13,290,000
## 19 Verizon Communications $13,240,000
## 20 CTIA $12,430,000
Note the url: https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2021
Do it iteratively, once per year, and save the results to a dataframe:
page <- "https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2021"

# The ten highest-spending clients in the 2021 table.
top10 <- read_html(page) %>%
  html_nodes("table") %>%
  html_table() %>%
  pluck(1) %>% # it returns a list
  slice(1:10) # keep the top 10
# do the same for 2010-2021, but only keep those in the top 10 in 2021
# can see evolution of lobbying spending this way
# Scrape the top-spenders table for a single year.
#
# year: value appended to the `cycle=` query parameter of the top-spenders URL.
#
# Returns the first table on that page, restricted to clients that appear in
# the global `top10` data frame, with a `year` column added. Prints the URL
# being scraped as a progress message.
scrape_fun <- function(year) {
  # construct the URL
  url <- paste0(
    "https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=",
    year
  )
  cat("Scraping url: ", url, "\n")

  df <- read_html(url) %>%
    html_nodes("table") %>%
    html_table() %>%
    pluck(1) %>% # html_table() returns a list; take its first element
    filter(`Lobbying Client` %in% top10$`Lobbying Client`) %>% # keep only clients in the top 10
    mutate(year = year)

  # Pause for 1 second between requests. The pause has to happen before the
  # value is returned: code placed after return() is never executed.
  Sys.sleep(1)
  df
}
# purrr::map over the year vector
# Run scrape_fun once per year and row-bind the results into one data frame.
top_spenders <- map_df(seq(2010, 2021), scrape_fun)
## Scraping url: https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2010
## Scraping url: https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2011
## Scraping url: https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2012
## Scraping url: https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2013
## Scraping url: https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2014
## Scraping url: https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2015
## Scraping url: https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2016
## Scraping url: https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2017
## Scraping url: https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2018
## Scraping url: https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2019
## Scraping url: https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2020
## Scraping url: https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2021
# equivalent to:
#for(i in 2010:2021){
# scrape function
#}
# Inspect the combined result: row count, column names, types and first values.
glimpse(top_spenders)
## Rows: 90
## Columns: 3
## $ `Lobbying Client` <chr> "US Chamber of Commerce", "Pharmaceutical Research &~
## $ `Total Spent` <chr> "$132,067,500", "$22,600,000", "$22,555,000", "$21,3~
## $ year <int> 2010, 2010, 2010, 2010, 2010, 2010, 2011, 2011, 2011~
A quick and dirty visualization:
# convert the total spent to numeric:
# `Total Spent` is scraped as text such as "$66,390,000"; convert it to a number.
top_spenders <- mutate(top_spenders, `Total Spent` = readr::parse_number(`Total Spent`))

# One line per client: spending by year, with dollar-formatted y-axis labels.
ggplot(
  top_spenders,
  aes(
    x = year,
    y = `Total Spent`,
    color = `Lobbying Client`,
    linetype = `Lobbying Client`
  )
) +
  geom_line() +
  scale_y_continuous(labels = scales::dollar_format()) +
  theme_classic()
# Same plot restricted to the first three clients in `top10`, with client
# names shortened to their first four words and a smoothed trend overlaid.
ggplot(
  top_spenders %>%
    filter(`Lobbying Client` %in% top10$`Lobbying Client`[1:3]) %>%
    mutate(`Lobbying Client` = word(`Lobbying Client`, 1, 4)),
  aes(
    x = year,
    y = `Total Spent`,
    color = `Lobbying Client`,
    linetype = `Lobbying Client`
  )
) +
  geom_line(alpha = .5) +
  geom_smooth(se = FALSE) +
  scale_y_continuous(labels = scales::dollar_format()) +
  theme_classic() +
  theme(
    legend.position = "bottom",
    legend.text = element_text(size = 8)
  )
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
A more complex task:
What’s nice about this sort of operation is it doesn’t depend on what is currently on the page! It automates every step of the process.
# href of every link inside the table body of the 2021 top-spenders page.
urls <- read_html("https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2021") %>%
  html_nodes("tbody a") %>%
  html_attr("href")

head(urls)
## [1] "/federal-lobbying/clients/summary?cycle=2021&id=D000019798&name=US+Chamber+of+Commerce"
## [2] "/federal-lobbying/clients/summary?cycle=2021&id=D000000062&name=National+Assn+of+Realtors"
## [3] "/federal-lobbying/clients/summary?cycle=2021&id=D000000504&name=Pharmaceutical+Research+%26+Manufacturers+of+America"
## [4] "/federal-lobbying/clients/summary?cycle=2021&id=D000032202&name=Business+Roundtable"
## [5] "/federal-lobbying/clients/summary?cycle=2021&id=D000000109&name=Blue+Cross%2FBlue+Shield"
## [6] "/federal-lobbying/clients/summary?cycle=2021&id=D000000116&name=American+Hospital+Assn"
# what we need from these is the ID
# lots of ways to get this with regex
# but regex is complicated so here's a hack:
# Split the first link on "&", take the second piece ("id=..."),
# and drop the "id=" prefix.
urls[1] %>%
  str_split("&") %>%
  pluck(1, 2) %>%
  str_replace_all("id=", "")
## [1] "D000019798"
# lobbyist list url looks like:
# https://www.opensecrets.org/federal-lobbying/clients/lobbyists?cycle=2021&id=D000019798
# so simply replace the value after `id=`
# scrape function:
# Scrape the 2021 lobbyist list for one client.
#
# url: a client summary link of the form
#   "/federal-lobbying/clients/summary?cycle=2021&id=D000019798&name=..."
#
# Returns the first table on the client's lobbyist page without its last row,
# plus the columns `urls` (lobbyist summary links), `id`, `perc_revolve` and
# `name`. Column names are cleaned with janitor::clean_names().
scrape_fun <- function(url) {
  # the id is the second "&"-separated piece of the link, e.g. "id=D000019798"
  id <- url %>%
    str_split("&") %>%
    pluck(1, 2) %>%
    str_replace_all("id=", "")

  new_url <- paste0(
    "https://www.opensecrets.org/federal-lobbying/clients/lobbyists?cycle=2021&id=",
    id
  )

  # Download the page once and reuse the parsed document for every extraction
  # below, instead of requesting the same URL four times.
  doc <- read_html(new_url)

  # `1:n()-1` parses as `(1:n()) - 1`, i.e. 0:(n - 1); drop the last row
  # explicitly instead.
  lobs_table <- doc %>%
    html_nodes("table") %>%
    html_table() %>%
    pluck(1) %>%
    slice(-n()) # don't want the last row

  # what about URLs for each person?
  lob_links <- doc %>%
    html_nodes(".component-wrap tbody a") %>%
    html_attr("href")

  # only the lobbyist summary links
  lob_links <- lob_links[str_detect(lob_links, "lobbyists/summary")]

  # bind to dataframe:
  lobs_table$urls <- lob_links

  ## add columns with info about the client:
  lobs_table$id <- id

  # percent revolvers:
  # in the component-wrap div, the third div within it, the text in the h3
  revolve <- doc %>%
    html_nodes(".component-wrap div:nth-child(3) h3") %>%
    html_text() %>%
    word(2) %>% # the part in parentheses
    str_replace_all("[(]|[)]", "") # remove parentheses

  # standardized name of client:
  name <- doc %>%
    html_node(".Hero-title") %>%
    html_text() %>%
    str_replace("Client Profile: ", "")

  lobs_table$perc_revolve <- revolve
  lobs_table$name <- name

  lobs_table <- janitor::clean_names(lobs_table)

  # Pause between requests. The pause has to happen before the value is
  # returned: code placed after return() is never executed.
  Sys.sleep(.5)
  lobs_table
}
# Scrape every client link and row-bind the per-client tables.
top_spenders <- map_df(urls, scrape_fun)

glimpse(top_spenders)
## Rows: 2,000
## Columns: 10
## $ lobbying_firm_hired <chr> "Akin, Gump et al", "Akin, Gump et al", "Ak~
## $ total_amount <chr> "$480,000", "$480,000", "$480,000", "$480,0~
## $ client <chr> "US Chamber Institute for Legal Reform", "U~
## $ lobbyist <chr> "G Hunter Bates", "Brendan Dunn", "Casey Hi~
## $ revolving_door_profiles <chr> "Revolving Door Profiles", "Revolving Door ~
## $ former_members_of_congress <chr> "Non Former Members of Congress", "Non Form~
## $ urls <chr> "/federal-lobbying/lobbyists/summary?cycle=~
## $ id <chr> "D000019798", "D000019798", "D000019798", "~
## $ perc_revolve <chr> "35.56%", "35.56%", "35.56%", "35.56%", "35~
## $ name <chr> "US Chamber of Commerce", "US Chamber of Co~
Let’s take a look at what we’ve got:
# The ten lobbying firms with the most rows in the scraped data.
top_spenders %>%
  group_by(lobbying_firm_hired) %>%
  summarize(total = n()) %>%
  arrange(desc(total)) %>%
  filter(row_number() <= 10)
## # A tibble: 10 x 2
## lobbying_firm_hired total
## <chr> <int>
## 1 Mehlman, Castagnetti et al 152
## 2 US Chamber of Commerce 100
## 3 Capitol Counsel 56
## 4 BGR Group 46
## 5 Akin, Gump et al 45
## 6 Amazon.com 40
## 7 Forbes Tate Partners 40
## 8 Tarplin, Downs & Young 40
## 9 American Hospital Assn 33
## 10 Crossroads Strategies 33
# One row per client, ordered by share of revolving-door lobbyists.
# `perc_revolve` is text such as "81.94%", so sort on its numeric value:
# sorting the strings would put "9.50%" ahead of "81.94%".
top_spenders %>%
  group_by(name) %>%
  slice(1) %>%
  select(name, perc_revolve) %>%
  arrange(desc(readr::parse_number(perc_revolve)))
## # A tibble: 20 x 2
## # Groups: name [20]
## name perc_revolve
## <chr> <chr>
## 1 Meta 81.94%
## 2 CTIA 81.48%
## 3 NCTA The Internet & Television Assn 80.39%
## 4 Comcast Corp 78.03%
## 5 Verizon Communications 75.00%
## 6 Business Roundtable 72.28%
## 7 Blue Cross/Blue Shield 70.59%
## 8 Raytheon Technologies 69.41%
## 9 American Chemistry Council 69.23%
## 10 Pharmaceutical Research & Manufacturers of America 67.69%
## 11 Boeing Co 66.96%
## 12 Amazon.com 65.77%
## 13 Lockheed Martin 65.22%
## 14 Biotechnology Innovation Organization 62.64%
## 15 AARP 60.32%
## 16 National Assn of Manufacturers 53.85%
## 17 American Medical Assn 50.94%
## 18 National Assn of Realtors 50.00%
## 19 American Hospital Assn 46.88%
## 20 US Chamber of Commerce 35.56%