Load the Zillow website.
library(rvest)
# Download and parse the fake Zillow listing page.
url <- "https://www.mfilipski.com/random/zillow"
zillow <- read_html(url)
# Show the internal structure of the parsed document object.
str(zillow)
## List of 2
## $ node:<externalptr>
## $ doc :<externalptr>
## - attr(*, "class")= chr [1:2] "xml_document" "xml_node"
Extract the following elements from the fake page: (1) the price of each house, and (2) the details of each house (bedrooms, bathrooms, square footage, and anything else you want).
library(tidyverse)
library(stringr)
# Collect the text of every node matching the CSS class `.list-card-price`.
#
# html: a parsed document (or node) that html_nodes() can search.
# Returns a character vector, one element per matching node, with leading
# and trailing whitespace removed.
price_house <- function(html) {
  price_nodes <- html_nodes(html, '.list-card-price')
  str_trim(html_text(price_nodes))
}
# Collect the text of every node matching the CSS class `.list-card-details`.
#
# html: a parsed document (or node) that html_nodes() can search.
# Returns a character vector, one element per matching node, with leading
# and trailing whitespace removed.
detail_house <- function(html) {
  detail_nodes <- html_nodes(html, '.list-card-details')
  str_trim(html_text(detail_nodes))
}
# Scrape both fields from the parsed page, then pair the i-th price with the
# i-th details string in a two-column data frame.
price <- price_house(zillow)
details <- detail_house(zillow)
df <- data.frame(
  price = price,
  details = details
)
df
## price details
## 1 $174,999 4 bds3 ba1,524 sqft- House for sale
## 2 $210,000 3 bds2 ba1,171 sqft- House for sale
## 3 $479,900 3 bds2 ba2,318 sqft- House for sale
## 4 $330,000 4 bds3 ba2,238 sqft- House for sale
## 5 $290,000 4 bds3 ba2,213 sqft- House for sale
## 6 $319,900 3 bds2 ba-- sqft- House for sale
## 7 $115,000 2 bds2 ba1,540 sqft- Condo for sale
## 8 $259,900 3 bds2 ba1,582 sqft- Condo for sale
## 9 $225,000 2 bds3 ba-- sqft- Condo for sale
## 10 $875,000 5 bds5 ba5,701 sqft- House for sale
## 11 $150,000 1 bd1 ba867 sqft- House for sale
# Parse bedrooms, bathrooms and square footage out of the free-text `details`
# column, and convert `price` from a "$174,999"-style string to a number.
#
# Each unit label directly follows its number (e.g. "4 bds3 ba1,524 sqft"),
# so every value is captured as the run of digits that precedes its label.
df <- df %>%
  mutate(
    # "bds?" accepts both the plural "bds" and the singular "bd" ("1 bd");
    # matching only "bds" left one-bedroom listings as NA.
    bedrooms = as.integer(str_extract(details, "\\d+(?=\\s*bds?)")),
    bathrooms = as.integer(str_extract(details, "\\d+(?=\\s*ba)")),
    # A missing area appears as "-- sqft": no digits, so the result is NA.
    sqft = str_extract(details, "[\\d,]+(?=\\s*sqft)"),
    # Remove every thousands separator, not just the first one.
    sqft = as.numeric(str_replace_all(sqft, ",", "")),
    # Keep only digits and the decimal point (drops "$" and ",").
    price = as.numeric(gsub("[^0-9.]+", "", price))
  )
df
## price details bedrooms bathrooms sqft
## 1 174999 4 bds3 ba1,524 sqft- House for sale 4 3 1524
## 2 210000 3 bds2 ba1,171 sqft- House for sale 3 2 1171
## 3 479900 3 bds2 ba2,318 sqft- House for sale 3 2 2318
## 4 330000 4 bds3 ba2,238 sqft- House for sale 4 3 2238
## 5 290000 4 bds3 ba2,213 sqft- House for sale 4 3 2213
## 6 319900 3 bds2 ba-- sqft- House for sale 3 2 NA
## 7 115000 2 bds2 ba1,540 sqft- Condo for sale 2 2 1540
## 8 259900 3 bds2 ba1,582 sqft- Condo for sale 3 2 1582
## 9 225000 2 bds3 ba-- sqft- Condo for sale 2 3 NA
## 10 875000 5 bds5 ba5,701 sqft- House for sale 5 5 5701
## 11 150000 1 bd1 ba867 sqft- House for sale NA 1 867
# Scatter plot of price against square footage; point size also maps to sqft.
ggplot(data = df) +
  geom_point(aes(x = sqft, y = price, size = sqft)) +
  # `title` (not `titles`) is the labs() argument that sets the plot title.
  labs(title = "Price and Square Footage")
Price and square footage have a positive, roughly linear relationship: the larger the square footage, the higher the price.
# Scatter plot of price against number of bedrooms; point size maps to the
# number of bathrooms.
ggplot(data = df) +
  geom_point(aes(x = bedrooms, y = price, size = bathrooms)) +
  # `title` (not `titles`) is the labs() argument that sets the plot title.
  labs(title = "Price and bedrooms & bathrooms")
Houses with more bedrooms tend to have higher prices.
library(stargazer)
# Fit a linear regression of price on bedrooms, bathrooms and square footage,
# then print the results table as plain text.
g <- lm(
  price ~ bedrooms + bathrooms + sqft,
  data = df
)
stargazer(
  g,
  title = "regression results",
  type = "text"
)
##
## regression results
## ===============================================
## Dependent variable:
## ---------------------------
## price
## -----------------------------------------------
## bedrooms 143,901.900**
## (32,939.320)
##
## bathrooms -254,137.900***
## (46,363.980)
##
## sqft 257.566***
## (21.341)
##
## Constant -51,690.340
## (52,284.740)
##
## -----------------------------------------------
## Observations 8
## R2 0.990
## Adjusted R2 0.982
## Residual Std. Error 32,574.450 (df = 4)
## F Statistic 127.319*** (df = 3; 4)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
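Both the number of bedrooms and the square footage of a house have a significant positive effect on the price, while the number of bathrooms has a significant negative effect. Note that the regression uses only 8 observations, so these estimates should be interpreted with caution.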
I chose to scrape the New York Times best-selling books (hardcover fiction list).
library(rvest)
# Fetch the NYT hardcover-fiction best-seller page and collect the text of
# the nodes that hold the book titles into a one-column data frame.
url <- "https://www.nytimes.com/books/best-sellers/hardcover-fiction/"
ny <- read_html(url)
# NOTE(review): ".css-5pe77f" looks like an auto-generated class name and may
# change when the site is rebuilt — confirm the selector still matches.
book_titles <- html_text(html_nodes(ny, ".css-5pe77f"))
books <- data.frame(book_titles)
books
## book_titles
## 1 LESSONS IN CHEMISTRY
## 2 HOMECOMING
## 3 ROMANTIC COMEDY
## 4 HANG THE MOON
## 5 TOMORROW, AND TOMORROW, AND TOMORROW
## 6 HELLO BEAUTIFUL
## 7 TRESS OF THE EMERALD SEA
## 8 THE SOULMATE
## 9 I WILL FIND YOU
## 10 COUNTDOWN
## 11 PINEAPPLE STREET
## 12 DEMON COPPERHEAD
## 13 REMARKABLY BRIGHT CREATURES
## 14 CAMP ZERO
## 15 ABOVE GROUND