Web Scraping in R Using the tidyverse and rvest

Download the full code here: https://joshuamccrain.com/tutorials/rvest/scraping.Rmd

Pipes!

library(tidyverse) 

# two values to add together
x <- 2
y <- 5

# ordinary call: both values are passed as arguments
sum(x, y)

## [1] 7

# same call written with the pipe: `x` is supplied as the first argument of sum()
x %>% sum(y)

## [1] 7

ex <- "Rep. PRICE, David (NC)"

# step by step, storing every intermediate result:
# take the second word, strip punctuation, then convert to title case
ex2 <- word(ex, 2)
ex3 <- str_replace_all(ex2, "[:punct:]", "")
ex4 <- str_to_title(ex3)

ex4

## [1] "Price"

# the same three steps as one pipeline, with no intermediate variables
ex %>% 
  word(2) %>% 
  str_replace_all("[:punct:]", "") %>% 
  str_to_title()

## [1] "Price"

Basic Scraping with `rvest`

# https://www.opensecrets.org/federal-lobbying/top-spenders


# install.packages("rvest") 
library(rvest)

# text of every <div> in the local test page, trimmed of surrounding whitespace
read_html("test.html") %>% 
  html_nodes("div") %>% 
  html_text(trim = TRUE) # spell out TRUE: `T` is an ordinary variable that can be reassigned

## [1] "My First Heading\r\n    My first paragraph."                               
## [2] "Some text\r\n\r\n     something important \r\n     something not important"

# identical to this clumsy thing:
# the same three calls without the pipe: each result is stored, then passed on
h <- read_html("test.html")
nodes <- html_nodes(h, "div")
html_text(nodes, trim = TRUE) # TRUE rather than the reassignable alias `T`

## [1] "My First Heading\r\n    My first paragraph."                               
## [2] "Some text\r\n\r\n     something important \r\n     something not important"

# "#div1" selects the element whose id is "div1"
read_html("test.html") %>% 
  html_nodes("#div1") %>% 
  html_text(trim = TRUE) # TRUE rather than the reassignable alias `T`

## [1] "My First Heading\r\n    My first paragraph."

# "#div1 h1" selects <h1> elements nested inside the element with id "div1"
read_html("test.html") %>% 
  html_nodes("#div1 h1") %>% 
  html_text(trim = TRUE) # TRUE rather than the reassignable alias `T`

## [1] "My First Heading"

# "#div2 span.important" selects <span class="important"> inside the element with id "div2"
read_html("test.html") %>% 
  html_nodes("#div2 span.important") %>% 
  html_text(trim = TRUE) # TRUE rather than the reassignable alias `T`

## [1] "something important"

# link text of every <a> inside elements with class "datadisplay"
"https://www.opensecrets.org/revolving/" %>% 
  read_html() %>% 
  html_nodes(css = ".datadisplay a") %>% 
  html_text()

## [1] "Wallander, Celeste A"   "Choudhry, Nusrat Jahan" "Hilliard, Earl"        
## [4] "Thompson, Tola"         "Whitney, Courtney"

# what about the links?

# same links as above, but pull the href attribute instead of the visible text
"https://www.opensecrets.org/revolving/" %>% 
  read_html() %>% 
  html_nodes(css = ".datadisplay a") %>% 
  html_attr(name = "href")

## [1] "rev_summary.php?id=82874" "rev_summary.php?id=82872"
## [3] "rev_summary.php?id=82871" "rev_summary.php?id=82870"
## [5] "rev_summary.php?id=82868"

# a specific element on the page?

# inside the element with id "latest": the first cell of the third table row
"https://www.opensecrets.org/revolving/" %>% 
  read_html() %>% 
  html_nodes(css = "#latest tr:nth-child(3) td:nth-child(1)") %>% 
  html_text()

## [1] "Hilliard, Earl"

Grabbing the whole table:

# parse the element with id "latest" into a data frame (returned inside a list)
"https://www.opensecrets.org/revolving/" %>% 
  read_html() %>% 
  html_nodes(css = "#latest") %>% 
  html_table()

## [[1]]
## # A tibble: 5 x 2
##   X1                     X2                    
##   <chr>                  <chr>                 
## 1 Wallander, Celeste A   US Dept of Defense    
## 2 Choudhry, Nusrat Jahan ACLU of Illinois      
## 3 Hilliard, Earl         Hilliard, Smith & Hunt
## 4 Thompson, Tola         Ballard Partners      
## 5 Whitney, Courtney      Ballard Partners

Scraping multiple pages recursively

Who are the top-spenders in lobbying by year? How many lobbyists do they hire? How many are revolvers?

# https://www.opensecrets.org/federal-lobbying/top-spenders

page <- "https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2021"

# every <table> on the page, parsed into a list of data frames
page %>% 
  read_html() %>% 
  html_nodes(css = "table") %>% 
  html_table()

## [[1]]
## # A tibble: 20 x 2
##    `Lobbying Client`                                  `Total Spent`
##    <chr>                                              <chr>        
##  1 US Chamber of Commerce                             $66,390,000  
##  2 National Assn of Realtors                          $44,004,025  
##  3 Pharmaceutical Research & Manufacturers of America $30,377,000  
##  4 Business Roundtable                                $29,120,000  
##  5 Blue Cross/Blue Shield                             $25,176,385  
##  6 American Hospital Assn                             $25,130,934  
##  7 Amazon.com                                         $20,590,000  
##  8 Meta                                               $20,070,000  
##  9 American Medical Assn                              $19,490,000  
## 10 American Chemistry Council                         $16,640,000  
## 11 Raytheon Technologies                              $15,390,000  
## 12 National Assn of Manufacturers                     $15,300,000  
## 13 Lockheed Martin                                    $14,401,911  
## 14 NCTA The Internet & Television Assn                $14,010,000  
## 15 AARP                                               $13,680,000  
## 16 Boeing Co                                          $13,450,000  
## 17 Comcast Corp                                       $13,380,000  
## 18 Biotechnology Innovation Organization              $13,290,000  
## 19 Verizon Communications                             $13,240,000  
## 20 CTIA                                               $12,430,000

Note the url: https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2021

Do it for every year, and save the results to a data frame:

page <- "https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2021"

# html_table() returns a list, so take its first element,
# then keep only the first ten rows of that table
top10 <- page %>% 
  read_html() %>% 
  html_nodes(css = "table") %>% 
  html_table() %>% 
  pluck(1) %>% 
  slice_head(n = 10)

# do the same for 2010-2021, but only keep those in the top 10 in 2021
# can see evolution of lobbying spending this way

# Scrape the OpenSecrets top-spenders table for one lobbying cycle.
#
# Args:
#   year: the cycle to request (e.g. 2021); appended to the URL's `cycle=`
#         query parameter.
#
# Returns:
#   The first table on the page, restricted to clients that appear in the
#   global `top10` data frame, with a `year` column added.
#
# Side effects: prints the URL being scraped and pauses for one second so
# consecutive calls do not hit the server back to back.
scrape_fun <- function(year){
  
  # construct the URL
  url <- paste0("https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=", year)
  
  cat("Scraping url: ", url, "\n")
  
  df <- read_html(url) %>% 
    html_nodes("table") %>% 
    html_table() %>% 
    pluck(1) %>% 
    filter(`Lobbying Client` %in% top10$`Lobbying Client`) %>%  # keep only clients in the top 10
    mutate(year = year)
  
  # pause for 1 second BEFORE handing back the result:
  # anything placed after return() is never executed
  Sys.sleep(1)
  
  df
  
}

# purrr::map over the year vector

# call scrape_fun() once per year and row-bind the per-year data frames
top_spenders <- 2010:2021 %>% 
  map_dfr(scrape_fun)

## Scraping url:  https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2010 
## Scraping url:  https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2011 
## Scraping url:  https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2012 
## Scraping url:  https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2013 
## Scraping url:  https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2014 
## Scraping url:  https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2015 
## Scraping url:  https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2016 
## Scraping url:  https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2017 
## Scraping url:  https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2018 
## Scraping url:  https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2019 
## Scraping url:  https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2020 
## Scraping url:  https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2021

# equivalent to:

#for(i in 2010:2021){
  # scrape function
#}



# compact overview of the combined result: column names, types and first values
glimpse(top_spenders)

## Rows: 90
## Columns: 3
## $ `Lobbying Client` <chr> "US Chamber of Commerce", "Pharmaceutical Research &~
## $ `Total Spent`     <chr> "$132,067,500", "$22,600,000", "$22,555,000", "$21,3~
## $ year              <int> 2010, 2010, 2010, 2010, 2010, 2010, 2011, 2011, 2011~

A quick and dirty visualization:

# convert the total spent to numeric:

# `Total Spent` arrives as text such as "$66,390,000"; parse it into a number
top_spenders <- top_spenders %>% 
  mutate(`Total Spent` = readr::parse_number(`Total Spent`))

# one line per client; colour and linetype both encode the client
top_spenders %>% 
  ggplot(
    aes(
      x = year,
      y = `Total Spent`,
      color = `Lobbying Client`,
      linetype = `Lobbying Client`
    )
  ) +
  geom_line() +
  scale_y_continuous(labels = scales::dollar_format()) +
  theme_classic()

# same plot restricted to the first three clients in `top10`,
# with a smoothed trend line drawn over the raw series
ggplot(
  top_spenders %>% 
    filter(`Lobbying Client` %in% top10$`Lobbying Client`[1:3]) %>% 
    # shorten each client name to its first four words for the legend
    # NOTE(review): names with fewer than four words may come back as NA
    # from word() -- confirm if the top three ever change
    mutate(`Lobbying Client` = word(`Lobbying Client`, 1, 4)),
  aes(
    x = year,
    y = `Total Spent`,
    color = `Lobbying Client`,
    linetype = `Lobbying Client`
  )
) +
  geom_line(alpha = .5) +
  geom_smooth(se = FALSE) + # FALSE rather than the reassignable alias `F`
  scale_y_continuous(labels = scales::dollar_format()) +
  theme_classic() +
  theme(legend.position = "bottom",
        legend.text = element_text(size = 8))

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

A more complex task:

1. Grab the URL of each lobbying client's page
2. Go to each URL and grab the list of lobbyists that the client hires
3. Iterate over those URLs to get info about the lobbyists they hire

What’s nice about this sort of operation is it doesn’t depend on what is currently on the page! It automates every step of the process.

Step 1: get the URLs of the 2021 top spenders

# href of every link inside the table body of the 2021 top-spenders page
urls <- "https://www.opensecrets.org/federal-lobbying/top-spenders?cycle=2021" %>% 
  read_html() %>% 
  html_nodes(css = "tbody a") %>% 
  html_attr(name = "href")

head(urls)

## [1] "/federal-lobbying/clients/summary?cycle=2021&id=D000019798&name=US+Chamber+of+Commerce"                              
## [2] "/federal-lobbying/clients/summary?cycle=2021&id=D000000062&name=National+Assn+of+Realtors"                           
## [3] "/federal-lobbying/clients/summary?cycle=2021&id=D000000504&name=Pharmaceutical+Research+%26+Manufacturers+of+America"
## [4] "/federal-lobbying/clients/summary?cycle=2021&id=D000032202&name=Business+Roundtable"                                 
## [5] "/federal-lobbying/clients/summary?cycle=2021&id=D000000109&name=Blue+Cross%2FBlue+Shield"                            
## [6] "/federal-lobbying/clients/summary?cycle=2021&id=D000000116&name=American+Hospital+Assn"

# what we need from these is the ID
# lots of ways to get this with regex
# but regex is complicated so here's a hack:

# split the first href on "&"; the second piece is "id=...", so strip the "id=" prefix
urls %>% 
  pluck(1) %>% 
  str_split(pattern = "&") %>% 
  pluck(1, 2) %>% 
  str_replace_all(pattern = "id=", replacement = "")

## [1] "D000019798"

# lobbyist list url looks like:
# https://www.opensecrets.org/federal-lobbying/clients/lobbyists?cycle=2021&id=D000019798
# so simply replace the value after `id=`


# scrape function:
# Scrape the lobbyists hired by one lobbying client in the 2021 cycle.
#
# Args:
#   url: a client-summary href taken from the top-spenders page, e.g.
#        "/federal-lobbying/clients/summary?cycle=2021&id=D000019798&name=...".
#        The client id is read from the second "&"-separated piece.
#
# Returns:
#   The first table on the client's lobbyists page with its last row dropped
#   and the columns `urls`, `id`, `perc_revolve` and `name` added; column
#   names are then passed through janitor::clean_names().
#
# Side effects: one HTTP request, followed by a half-second pause.
scrape_fun <- function(url){
  
  #id <- "D000019798"
  
  id <- url %>% 
    str_split("&") %>% 
    pluck(1, 2) %>% 
    str_replace_all("id=", "")
  
  new_url <- paste0("https://www.opensecrets.org/federal-lobbying/clients/lobbyists?cycle=2021&id=", id)
  
  # download and parse the page once; every extraction below reuses it
  # instead of requesting the same URL again
  doc <- read_html(new_url)
  
  lobs_table <- doc %>% 
    html_nodes("table") %>% 
    html_table() %>% 
    pluck(1) %>% 
    slice(-n()) # don't want the last row
  
  # what about URLs for each person?
  lob_links <- doc %>% 
    html_nodes(".component-wrap tbody a") %>% 
    html_attr("href")
  
  # only the lobbyist summary links
  lob_links <- lob_links[str_detect(lob_links, "lobbyists/summary")]
  
  # bind to dataframe:
  
  lobs_table$urls <- lob_links
  
  ## add columns with info about the client:
  lobs_table$id <- id
  
  # percent revolvers:
  revolve <- doc %>% 
    # in the component-wrap div, the third div within it, the text in the h3
    html_nodes(".component-wrap div:nth-child(3) h3") %>% 
    html_text() %>% 
    word(2) %>% # the part in parentheses
    str_replace_all("[(]|[)]", "") # remove parentheses
  
  
  # standardized name of client:
  name <- doc %>% 
    html_node(".Hero-title") %>% 
    html_text() %>% 
    str_replace("Client Profile: ", "")
  
  lobs_table$perc_revolve <- revolve
  lobs_table$name <- name
  
  lobs_table <- janitor::clean_names(lobs_table)
  
  # pause BEFORE handing back the result:
  # anything placed after return() is never executed
  Sys.sleep(.5)
  
  lobs_table
}


# call scrape_fun() once per client URL and row-bind the resulting tables
top_spenders <- urls %>% 
  map_dfr(scrape_fun)

glimpse(top_spenders)

## Rows: 2,000
## Columns: 10
## $ lobbying_firm_hired        <chr> "Akin, Gump et al", "Akin, Gump et al", "Ak~
## $ total_amount               <chr> "$480,000", "$480,000", "$480,000", "$480,0~
## $ client                     <chr> "US Chamber Institute for Legal Reform", "U~
## $ lobbyist                   <chr> "G Hunter Bates", "Brendan Dunn", "Casey Hi~
## $ revolving_door_profiles    <chr> "Revolving Door Profiles", "Revolving Door ~
## $ former_members_of_congress <chr> "Non Former Members of Congress", "Non Form~
## $ urls                       <chr> "/federal-lobbying/lobbyists/summary?cycle=~
## $ id                         <chr> "D000019798", "D000019798", "D000019798", "~
## $ perc_revolve               <chr> "35.56%", "35.56%", "35.56%", "35.56%", "35~
## $ name                       <chr> "US Chamber of Commerce", "US Chamber of Co~

Let’s take a look at what we’ve got:

# number of rows per lobbying firm, largest first, top ten only
top_spenders %>% 
  count(lobbying_firm_hired, name = "total", sort = TRUE) %>% 
  slice_head(n = 10)

## # A tibble: 10 x 2
##    lobbying_firm_hired        total
##    <chr>                      <int>
##  1 Mehlman, Castagnetti et al   152
##  2 US Chamber of Commerce       100
##  3 Capitol Counsel               56
##  4 BGR Group                     46
##  5 Akin, Gump et al              45
##  6 Amazon.com                    40
##  7 Forbes Tate Partners          40
##  8 Tarplin, Downs & Young        40
##  9 American Hospital Assn        33
## 10 Crossroads Strategies         33

# one row per client (perc_revolve is repeated on every row of a client),
# ranked by share of revolvers
top_spenders %>% 
  group_by(name) %>% 
  slice(1) %>% 
  select(name, perc_revolve) %>% 
  # perc_revolve is text such as "81.94%"; sort on its numeric value, because
  # sorted as text "9.50%" would rank above "81.94%"
  arrange(desc(readr::parse_number(perc_revolve)))

## # A tibble: 20 x 2
## # Groups:   name [20]
##    name                                               perc_revolve
##    <chr>                                              <chr>       
##  1 Meta                                               81.94%      
##  2 CTIA                                               81.48%      
##  3 NCTA The Internet & Television Assn                80.39%      
##  4 Comcast Corp                                       78.03%      
##  5 Verizon Communications                             75.00%      
##  6 Business Roundtable                                72.28%      
##  7 Blue Cross/Blue Shield                             70.59%      
##  8 Raytheon Technologies                              69.41%      
##  9 American Chemistry Council                         69.23%      
## 10 Pharmaceutical Research & Manufacturers of America 67.69%      
## 11 Boeing Co                                          66.96%      
## 12 Amazon.com                                         65.77%      
## 13 Lockheed Martin                                    65.22%      
## 14 Biotechnology Innovation Organization              62.64%      
## 15 AARP                                               60.32%      
## 16 National Assn of Manufacturers                     53.85%      
## 17 American Medical Assn                              50.94%      
## 18 National Assn of Realtors                          50.00%      
## 19 American Hospital Assn                             46.88%      
## 20 US Chamber of Commerce                             35.56%

Web Scraping in R Using the tidyverse and rvest

By Josh McCrain
Twitter

2/24/2022

Pipes!

Basic Scraping with `rvest`

Scraping multiple pages recursively

Step 1: get the URLs of the 2021 top spenders

Web Scraping in R Using the tidyverse and rvest

By Josh McCrain Twitter

2/24/2022

Pipes!

Basic Scraping with rvest

Scraping multiple pages recursively

Step 1: get the URLs of the 2021 top spenders

By Josh McCrain
Twitter

Basic Scraping with `rvest`