# checking to see if the pacman package is installed + installing it if needed
if(require(pacman)==FALSE) install.packages("pacman")
pacman::p_load(tidyverse,
rvest,
lubridate, # to fix the date
magrittr)
When you click on Found and Impounded Property Listing on the Property - Lost, Found, Impounded Page, you will be taken to a Google Doc containing a table of lost and found items.
Please scrape the table and print it out. Your code should be self-contained in the code chunk below.
# Download the published Google Sheet, parse every <table> on the page and
# keep the first one (the page holds only the lost-and-found listing).
lost_and_found <-
  read_html("https://docs.google.com/spreadsheets/d/e/2PACX-1vQ3uk9AJOMODxS9fUgX_4vnEMj-Di7ulkTXWzPUmaHvHbaII63xmKmRu3VaBvOXrwQhtkOUlL9fxLMB/pubhtml?gid=1104208671&single=true") %>%
  html_elements("table") %>%
  html_table() %>%
  extract2(1) # magrittr alias for `[[`
lost_and_found
## # A tibble: 60 x 5
## `` `` `` `` ``
## <int> <chr> <chr> <chr> <chr>
## 1 1 "Date Found / Impounded / Turned into MU Police Dept" "Ite~ "Gen~ "Loc~
## 2 NA "" "" "" ""
## 3 2 "11/24/2021" "Wal~ "Wal~ "Law~
## 4 3 "11/29/2021" "Key~ "Key~ "Arm~
## 5 4 "12/1/2021" "Ele~ "Air~ "Arm~
## 6 5 "12/1/2021" "Key~ "Key~ "Arm~
## 7 6 "12/7/2021" "Cre~ "Cre~ "Arm~
## 8 7 "12/1/2021" "Ele~ "Air~ "Arm~
## 9 8 "12/8/2021" "Cre~ "Cre~ "Rec~
## 10 9 "12/10/2021" "Wal~ "Wal~ "101~
## # ... with 50 more rows
# The first scraped row holds the real headers: use it as the column names,
# then drop that row from the data.
names(lost_and_found) <- lost_and_found[1, ]
lost_and_found <- lost_and_found[-1, ]
lost_and_found # confirm the header row is gone
## # A tibble: 59 x 5
## `1` `Date Found / Impou~` `Item Category` `General Descr~` `Location Found`
## <int> <chr> <chr> <chr> <chr>
## 1 NA "" "" "" ""
## 2 2 "11/24/2021" "Wallet/Purses" "Wallet" "Laws Hall"
## 3 3 "11/29/2021" "Keys" "Keys" "Armstrong"
## 4 4 "12/1/2021" "Electronics" "Airpods" "Armstrong"
## 5 5 "12/1/2021" "Keys" "Keys" "Armstrong"
## 6 6 "12/7/2021" "Credit Card" "Credit Card" "Armstrong"
## 7 7 "12/1/2021" "Electronics" "Airpods" "Armstrong"
## 8 8 "12/8/2021" "Credit Card" "Credit Card" "Rec Center"
## 9 9 "12/10/2021" "Wallet/Purses" "Wallet" "101 McGuffey"
## 10 10 "12/15/2021" "Other misc it~ "Wheel" "Unknown"
## # ... with 49 more rows
# The new first row is the empty spacer row from the sheet; remove it too.
lost_and_found <- slice(lost_and_found, -1)
lost_and_found # confirm the blank row is gone
## # A tibble: 58 x 5
## `1` `Date Found / Impou~` `Item Category` `General Descr~` `Location Found`
## <int> <chr> <chr> <chr> <chr>
## 1 2 11/24/2021 Wallet/Purses Wallet Laws Hall
## 2 3 11/29/2021 Keys Keys Armstrong
## 3 4 12/1/2021 Electronics Airpods Armstrong
## 4 5 12/1/2021 Keys Keys Armstrong
## 5 6 12/7/2021 Credit Card Credit Card Armstrong
## 6 7 12/1/2021 Electronics Airpods Armstrong
## 7 8 12/8/2021 Credit Card Credit Card Rec Center
## 8 9 12/10/2021 Wallet/Purses Wallet 101 McGuffey
## 9 10 12/15/2021 Other misc ite~ Wheel Unknown
## 10 11 12/16/2021 Wallet/Purses Wallet Unknown
## # ... with 48 more rows
# The first column is just the sheet's row counter; drop it by position.
lost_and_found <- select(lost_and_found, -1)
lost_and_found # confirm the counter column is gone
## # A tibble: 58 x 4
## `Date Found / Impounded /~` `Item Category` `General Descr~` `Location Found`
## <chr> <chr> <chr> <chr>
## 1 11/24/2021 Wallet/Purses Wallet Laws Hall
## 2 11/29/2021 Keys Keys Armstrong
## 3 12/1/2021 Electronics Airpods Armstrong
## 4 12/1/2021 Keys Keys Armstrong
## 5 12/7/2021 Credit Card Credit Card Armstrong
## 6 12/1/2021 Electronics Airpods Armstrong
## 7 12/8/2021 Credit Card Credit Card Rec Center
## 8 12/10/2021 Wallet/Purses Wallet 101 McGuffey
## 9 12/15/2021 Other misc ite~ Wheel Unknown
## 10 12/16/2021 Wallet/Purses Wallet Unknown
## # ... with 48 more rows
Currently, the Farmer School of Business has the following academic departments:
- Accountancy
- Economics
- Entrepreneurship
- Finance
- Information Systems & Analytics
- Marketing
- Management
Using the code chunk below, please write code that will produce and print a single tibble containing information on ALL departments and the following variables: (a) department name, (b) faculty/staff’s name, (c) faculty/staff’s position, and (d) faculty/staff’s website
# Build one tibble of faculty/staff across all FSB departments.
# Every department page follows the same URL pattern, so the page-level
# scraping lives in a helper and the per-department results are row-bound
# once at the end instead of growing a tibble with rbind() inside a loop.
depts <- c(
  "Accountancy", "Economics", "Entrepreneurship", "Finance", "ISA",
  "Marketing", "Management"
)
base_url1 <- "https://www.miamioh.edu/fsb/academics/"
base_url2 <- "/about/faculty-staff/index.html"

# Scrape a single department's faculty/staff page.
#   dept   - department slug exactly as it appears in the page URL
#   prefix - URL part before the slug
#   suffix - URL part after the slug
# Returns a tibble with columns staff_dept, staff_name, staff_position and
# staff_website. tibble() errors if the selectors return different lengths.
scrape_dept_staff <- function(dept, prefix = base_url1, suffix = base_url2) {
  htmldata <- read_html(paste0(prefix, dept, suffix))

  # Name and website come from the same <a> nodes, so select them once.
  name_links <- html_elements(htmldata, css = "strong > a")

  tibble(
    staff_dept = htmldata %>%
      html_elements(css = "td:nth-child(1)") %>%
      html_text2(),
    staff_name = html_text2(name_links),
    staff_position = htmldata %>%
      html_elements(css = "i") %>%
      html_text(trim = TRUE),
    staff_website = html_attr(name_links, "href")
  )
}

staff_complete <- bind_rows(lapply(depts, scrape_dept_staff))
staff_complete
## # A tibble: 188 x 4
## staff_dept staff_name staff_position staff_website
## <chr> <chr> <chr> <chr>
## 1 Accountancy Dr. Brian Ballou EY Professor of Accountancy, ~ http://miami~
## 2 Accountancy Dr. William Brink Associate Professor http://miami~
## 3 Accountancy Dr. Po-Chang Chen Endres Associate Professor Fe~ http://miami~
## 4 Accountancy Dr. Timothy Eaton Professor http://miami~
## 5 Accountancy Dr. Jan Eighme Teaching Professor http://miami~
## 6 Accountancy Dr. Anne Farrell PricewaterhouseCoopers Profes~ http://miami~
## 7 Accountancy Dr. Michele Frank Assistant Professor http://miami~
## 8 Accountancy Mrs. Deborah Gentry Administrative Assistant http://miami~
## 9 Accountancy Dr. Jonathan Grenier Professor &Miami PRIME Direct~ http://miami~
## 10 Accountancy Dr. Dan Heitger Deloitte Professor &William I~ http://miami~
## # ... with 178 more rows
The most popular listings on Netflix are rated and reviewed on IMDb. Based on this webpage and its following pages, please create a tibble that contains the following:
Your tibble should contain a variable for the 9 items above for each of the 50 titles found on the page.
# Scrape the first results page of IMDb's Netflix listing.
Netflix_Ratings <- read_html("https://www.imdb.com/search/title/?companies=co0144901")

# One node per listed title. Every field below is looked up *inside* its own
# title node with html_element() (singular), which yields NA when a title
# lacks that field. This keeps all vectors the same length and aligned,
# unlike page-wide html_elements() calls that silently drop missing entries.
Netflix_items <- Netflix_Ratings %>%
  html_elements(css = "div.lister-list > div.lister-item.mode-advanced")

# Text of the first node matching `css` within each item (NA if no match).
#   items - nodeset with one node per title
#   css   - CSS selector evaluated relative to each item
#   trim  - passed on to html_text()
item_text <- function(items, css, trim = TRUE) {
  items %>%
    html_element(css = css) %>%
    html_text(trim = trim)
}

# Create Titles Variable
Netflix_titles <- item_text(
  Netflix_items,
  "div.lister-item-content > h3.lister-item-header > a",
  trim = FALSE
)

# Create Variable for movie years
Netflix_years <- item_text(
  Netflix_items,
  "span.lister-item-year.text-muted.unbold",
  trim = FALSE
)

# Create Variable for Age Classification
Netflix_Age_Classification <- item_text(
  Netflix_items,
  "span.certificate",
  trim = FALSE
)

# Create Variable for duration
Netflix_duration <- item_text(Netflix_items, "span.runtime")

# Create Variable for Genres
Netflix_genre <- item_text(Netflix_items, "span.genre")

# Create Variable for rate
Netflix_rate <- item_text(Netflix_items, "div.inline-block.ratings-imdb-rating")

# Create Variable for 1-2 sentence summary
# NOTE(review): the summary/stars selectors are positional (nth-child), so a
# title missing an earlier sibling may return a different paragraph - confirm
# against the page markup.
Netflix_summary <- item_text(
  Netflix_items,
  "div.lister-item-content > p:nth-child(4)"
)

# Create Variable for stars
Netflix_stars <- item_text(
  Netflix_items,
  "div.lister-item-content > p:nth-child(5)"
)

# Create vote variable; titles without a vote count get "N/A".
# This is a plain character vector so the tibble column is named
# `Netflix_votes` (rbind()-ing unnamed strings onto an empty tibble produced
# a column named after the first scraped value).
Netflix_votes <- item_text(
  Netflix_items,
  "div.lister-item-content > p.sort-num_votes-visible > span:nth-child(2)"
)
Netflix_votes[is.na(Netflix_votes)] <- "N/A"

# Combine all variables for a tibble
IMDB_Movies <- tibble(
  Netflix_titles,
  Netflix_Age_Classification,
  Netflix_genre,
  Netflix_summary,
  Netflix_years,
  Netflix_votes
)
IMDB_Movies
## # A tibble: 50 x 6
## Netflix_titles Netflix_Age_Cla~ Netflix_genre Netflix_summary Netflix_years
## <chr> <chr> <chr> <chr> <chr>
## 1 Attack on Titan TV-MA Animation, A~ After his home~ (2013–2022)
## 2 The Power of th~ R Drama, Roman~ Charismatic ra~ (2021)
## 3 Ozark TV-MA Crime, Drama~ A financial ad~ (2017–2022)
## 4 The Woman in th~ TV-MA Comedy, Crim~ When a handsom~ (2022)
## 5 Sweet Magnolias TV-14 Drama, Roman~ Three South Ca~ (2020– )
## 6 The Tinder Swin~ TV-MA Documentary,~ Posing as a we~ (2022)
## 7 All of Us Are D~ TV-MA Action, Dram~ A high school ~ (2022– )
## 8 Inventing Anna TV-MA Drama A journalist w~ (2022)
## 9 Murderville TV-MA Comedy, Crim~ Eccentric dete~ (2022– )
## 10 Demon Slayer: K~ TV-MA Animation, A~ A family is at~ (2019– )
## # ... with 40 more rows, and 1 more variable: X.324.249. <chr>
Expand on the previous example to capture the top 300 titles on Netflix (i.e., the information across six pages).
## For this question, we excluded the variables that experienced missing values.
## Our final tibble includes Titles, Genre, Summary, and Years.
# Scrape the top 300 Netflix titles on IMDb (six result pages).
# Each page is downloaded exactly once and every field is extracted per
# title, so the columns stay aligned even when a title lacks a field.
# Duration, rating and stars are left out because they had missing values.

# The first page has no start offset; the following pages start at these
# offsets.
all_pages <- list(51, 101, 151, 201, 251)
base_url <- "https://www.imdb.com/search/title/?companies=co0144901&start="
end_url <- "&ref_=adv_nxt"
page_urls <- c(
  "https://www.imdb.com/search/title/?companies=co0144901",
  paste0(base_url, unlist(all_pages), end_url)
)

# Scrape one results page.
#   page_url - full URL of an IMDb search results page
# Returns a tibble with one row per listed title and the columns
# Netflix_titles, Netflix_years, Netflix_Age_Classification, Netflix_genre
# and Netflix_summary (NA where a title has no such element).
scrape_imdb_page <- function(page_url) {
  items <- read_html(page_url) %>%
    html_elements(css = "div.lister-list > div.lister-item.mode-advanced")

  tibble(
    Netflix_titles = items %>%
      html_element(css = "div.lister-item-content > h3.lister-item-header > a") %>%
      html_text(),
    Netflix_years = items %>%
      html_element(css = "span.lister-item-year.text-muted.unbold") %>%
      html_text(),
    Netflix_Age_Classification = items %>%
      html_element(css = "span.certificate") %>%
      html_text(trim = TRUE),
    Netflix_genre = items %>%
      html_element(css = "span.genre") %>%
      html_text(trim = TRUE),
    # NOTE(review): positional selector; a title missing an earlier sibling
    # may return a different paragraph - confirm against the page markup.
    Netflix_summary = items %>%
      html_element(css = "div.lister-item-content > p:nth-child(4)") %>%
      html_text(trim = TRUE)
  )
}

# Row-bind all pages once instead of growing vectors with append().
imdb_all <- bind_rows(lapply(page_urls, scrape_imdb_page))

# Keep the individual vectors available under their original names.
Netflix_titles <- imdb_all$Netflix_titles
Netflix_years <- imdb_all$Netflix_years
Netflix_Age_Classification <- imdb_all$Netflix_Age_Classification
Netflix_genre <- imdb_all$Netflix_genre
Netflix_summary <- imdb_all$Netflix_summary

imdb <- imdb_all[
  c("Netflix_titles", "Netflix_genre", "Netflix_summary", "Netflix_years")
]
imdb
## # A tibble: 300 x 4
## Netflix_titles Netflix_genre Netflix_summary Netflix_years
## <chr> <chr> <chr> <chr>
## 1 Attack on Titan Animation, Acti~ After his home~ (2013–2022)
## 2 The Power of the Dog Drama, Romance,~ Charismatic ra~ (2021)
## 3 Ozark Crime, Drama, T~ A financial ad~ (2017–2022)
## 4 The Woman in the House Comedy, Crime, ~ When a handsom~ (2022)
## 5 Sweet Magnolias Drama, Romance Three South Ca~ (2020– )
## 6 The Tinder Swindler Documentary, Cr~ Posing as a we~ (2022)
## 7 All of Us Are Dead Action, Drama, ~ A high school ~ (2022– )
## 8 Inventing Anna Drama A journalist w~ (2022)
## 9 Murderville Comedy, Crime, ~ Eccentric dete~ (2022– )
## 10 Demon Slayer: Kimetsu no Yaiba Animation, Acti~ A family is at~ (2019– )
## # ... with 290 more rows
In assignment 02, I shared with you an RDS file containing four variables and all the reviews that were performed on Patterson Cafe on Yelp. Use what you have learned in class to potentially recreate the same results.
# --- Utilizing robots.txt to see if we are allowed to scrape page ---
# at www.yelp.com/robots.txt, it states, “Disallow: /biz/*destination=*” which prevents us from scraping the yelp page for Patterson’s as it follows that same path. Below is the code we worked on prior to learning this information. The trouble we encountered converting the stars into a score may be explained by the fact that they are considered images upon inspection of the html.
# Scrape Patterson's Cafe reviews from Yelp, one results page at a time, and
# collect them in `reviews`.
# The earlier version built a per-page link but never used it, so every
# iteration re-read the first page; the link also had a literal "90" appended
# to the offset.

# Scrape one page of reviews.
#   start - review offset passed as the `start` query parameter
#   base  - business page URL
# Returns a tibble with columns reviewer, review_date, score and comments.
# A page whose fields have unequal lengths is skipped with a warning.
scrape_yelp_page <- function(start,
                             base = "https://www.yelp.com/biz/pattersons-cafe-oxford") {
  patterson_reviews <- read_html(paste0(base, "?start=", start))

  selectors <- list(
    reviewer = ".css-1iikwpv .css-1422juy",
    review_date = ".margin-b1-5__09f24__NHcQi .css-1e4fdj9",
    # NOTE(review): the stars are rendered as images, so the text of this node
    # is empty; the rating is probably exposed through an attribute (e.g.
    # aria-label) - confirm in the page source before switching to html_attr().
    score = ".margin-b1-5__09f24__NHcQi .overflow--hidden__09f24___ayzG",
    comments = ".comment__09f24__gu0rG .raw__09f24__T4Ezm"
  )

  scraped <- lapply(selectors, function(css) {
    patterson_reviews %>%
      html_elements(css = css) %>%
      html_text2()
  })

  # Columns can only be combined row-wise when every field matched the same
  # number of nodes; otherwise reviewers and comments would be misaligned.
  if (length(unique(lengths(scraped))) != 1) {
    warning(
      "Skipping start = ", start, ": scraped fields have unequal lengths",
      call. = FALSE
    )
    scraped <- lapply(scraped, function(x) character(0))
  }

  as_tibble(scraped)
}

# NOTE(review): Yelp's `start` offset appears to be 0-based in steps of 10
# (0, 10, ..., 80 for nine pages) - confirm against the site's pagination.
page_starts <- seq(from = 0, to = 80, by = 10)
reviews <- bind_rows(lapply(page_starts, scrape_yelp_page))
str(reviews)

# Expose the combined columns under the original vector names.
reviewer <- reviews$reviewer
review_date <- reviews$review_date
score <- reviews$score
comments <- reviews$comments
## chr [1:10] "Dana M." "Melanie U." "Ken N." "Landon C." "Missy H." ...
## chr [1:10] "11/30/2021" "7/21/2021" "9/24/2020" "7/22/2021" "10/24/2021" ...
## chr [1:10] "" "" "" "" "" "" "" "" "" ""
## chr [1:10] "Fabulous gluten free Monte Cristi. Huge pancakes aabd large portions of spaghetti like home fries." ...
## chr [1:10] "Dana M." "Melanie U." "Ken N." "Landon C." "Missy H." ...
## chr [1:10] "11/30/2021" "7/21/2021" "9/24/2020" "7/22/2021" "10/24/2021" ...
## chr [1:10] "" "" "" "" "" "" "" "" "" ""
## chr [1:10] "Fabulous gluten free Monte Cristi. Huge pancakes aabd large portions of spaghetti like home fries." ...
## chr(0)
## chr(0)
## chr(0)
## chr(0)
## chr [1:10] "Dana M." "Melanie U." "Ken N." "Landon C." "Missy H." ...
## chr [1:10] "11/30/2021" "7/21/2021" "9/24/2020" "7/22/2021" "10/24/2021" ...
## chr [1:10] "" "" "" "" "" "" "" "" "" ""
## chr [1:10] "Fabulous gluten free Monte Cristi. Huge pancakes aabd large portions of spaghetti like home fries." ...
## chr [1:10] "Dana M." "Melanie U." "Ken N." "Landon C." "Missy H." ...
## chr [1:10] "11/30/2021" "7/21/2021" "9/24/2020" "7/22/2021" "10/24/2021" ...
## chr [1:10] "" "" "" "" "" "" "" "" "" ""
## chr [1:10] "Fabulous gluten free Monte Cristi. Huge pancakes aabd large portions of spaghetti like home fries." ...
## chr [1:10] "Dana M." "Melanie U." "Ken N." "Landon C." "Missy H." ...
## chr [1:10] "11/30/2021" "7/21/2021" "9/24/2020" "7/22/2021" "10/24/2021" ...
## chr [1:10] "" "" "" "" "" "" "" "" "" ""
## chr [1:10] "Fabulous gluten free Monte Cristi. Huge pancakes aabd large portions of spaghetti like home fries." ...
## chr [1:10] "Dana M." "Melanie U." "Ken N." "Landon C." "Missy H." ...
## chr [1:10] "11/30/2021" "7/21/2021" "9/24/2020" "7/22/2021" "10/24/2021" ...
## chr [1:10] "" "" "" "" "" "" "" "" "" ""
## chr [1:10] "Fabulous gluten free Monte Cristi. Huge pancakes aabd large portions of spaghetti like home fries." ...
## chr [1:10] "Dana M." "Melanie U." "Ken N." "Landon C." "Missy H." ...
## chr [1:10] "11/30/2021" "7/21/2021" "9/24/2020" "7/22/2021" "10/24/2021" ...
## chr [1:10] "" "" "" "" "" "" "" "" "" ""
## chr [1:10] "Fabulous gluten free Monte Cristi. Huge pancakes aabd large portions of spaghetti like home fries." ...
## chr [1:10] "Dana M." "Melanie U." "Ken N." "Landon C." "Missy H." ...
## chr [1:10] "11/30/2021" "7/21/2021" "9/24/2020" "7/22/2021" "10/24/2021" ...
## chr [1:10] "" "" "" "" "" "" "" "" "" ""
## chr [1:10] "Fabulous gluten free Monte Cristi. Huge pancakes aabd large portions of spaghetti like home fries." ...
# Assemble the scraped review fields into one tibble.
review <- tibble(
  reviewer = reviewer,
  review_date = review_date,
  score = score,
  comments = comments
)
Email: alcornmp@miamioh.edu↩︎
Email: davayipd@miamioh.edu↩︎
Email: henchkm@miamioh.edu↩︎