# Load the scraping packages, parse the results page, and extract the rank
# shown for each listed title.
library(rvest)
library(xml2)

# NOTE(review): `url` is not assigned anywhere in the visible script. It must
# hold the address of the results page before this line runs; otherwise the
# name resolves to base R's url() function and read_html() fails.
imdb <- read_html(url)

rank_html <- html_nodes(imdb, ".text-primary")
rank <- html_text(rank_html)
rank # still a character vector at this point

# Reassign rank as numeric. Thousands separators are removed first so that a
# value such as "1,001." is not coerced to NA.
rank <- as.numeric(gsub(",", "", rank, fixed = TRUE))
rank
# Titles: text of the links found inside each list item's header.
title_html <- html_nodes(imdb, ".lister-item-header a")
title <- html_text(title_html)
title
# Descriptions: select the .text-muted element that directly follows a
# .ratings-bar element, then take its text.
desc_html <- html_nodes(imdb, ".ratings-bar+ .text-muted")
desc <- html_text(desc_html)
desc

# Remove newlines and the blanks surrounding each description.
# NOTE(review): the pattern reached this file as an empty string, which makes
# gsub() a no-op; it was presumably "\n" lost in copy/paste -- confirm.
desc_new <- trimws(gsub("\n", "", desc, fixed = TRUE))
# Runtime: take the .runtime text, drop the " min" suffix, and convert the
# remainder to numeric.
runtime_html <- html_nodes(imdb, ".runtime")
runtime <- html_text(runtime_html)
runtime_num <- gsub(" min", "", runtime, fixed = TRUE)
runtime_num <- as.numeric(runtime_num)
runtime_num
# Genre: take the .genre text and remove newlines and spaces.
genre_html <- html_nodes(imdb, ".genre")
genre <- html_text(genre_html)
# NOTE(review): the first pattern reached this file as an empty string;
# presumably it was "\n" lost in copy/paste -- confirm.
genre <- gsub("\n", "", genre, fixed = TRUE)
# Keep the result of the space removal (it was previously discarded).
genre <- gsub(" ", "", genre, fixed = TRUE)
genre
##################################
# Ratings: text of the <strong> element inside each .ratings-bar, as numeric.
ratings_html <- html_nodes(imdb, ".ratings-bar strong")
ratings <- html_text(ratings_html)
ratings <- as.numeric(ratings)
ratings
# Votes: take the second child <span> of .sort-num_votes-visible, strip the
# thousands separators, and convert to numeric.
votes_html <- html_nodes(imdb, ".sort-num_votes-visible span:nth-child(2)")
votes <- html_text(votes_html)
votes <- gsub(",", "", votes, fixed = TRUE)
votes <- as.numeric(votes)
votes
################################
# Director: text of the first link in the paragraph that follows .text-muted.
director_html <- html_nodes(imdb, ".text-muted+ p a:nth-child(1)")
director <- html_text(director_html)
director
# Actor: text of the link that directly follows the .ghost separator inside
# each list item's content.
actor_html <- html_nodes(imdb, ".lister-item-content .ghost+ a")
actor <- html_text(actor_html)
actor
# Addresses of five further result pages of the same IMDb search. The pages
# differ only in their `start` offset (251, 501, 751, 1001, 1251), so the
# shared query is written once and the offsets are spliced in.
search_url <- paste0(
  "https://www.imdb.com/search/title/",
  "?title_type=feature",
  "&release_date=2020-03-01,2021-01-31",
  "&user_rating=1.0,10.0",
  "&num_votes=1,1000000",
  "&languages=en",
  "&count=250"
)
page_starts <- seq.int(251L, 1251L, by = 250L)

# Kept as a list (one URL string per element), as before.
urls <- as.list(paste0(search_url, "&start=", page_starts, "&ref_=adv_nxt"))
# Collect the ranks from every page in `urls` into one numeric vector.
# One slot per page is allocated up front and the slots are flattened at the
# end, so the vector is not regrown on each iteration.
rank_pages <- vector("list", length(urls))
for (page_idx in seq_along(urls)) {
  # rank
  imdb_html <- read_html(urls[[page_idx]])
  rank_html2 <- html_nodes(imdb_html, ".text-primary")
  rank2 <- html_text(rank_html2)
  # Remove thousands separators so that a value such as "1,001." is not
  # coerced to NA.
  rank2 <- as.numeric(gsub(",", "", rank2, fixed = TRUE))
  print(rank2)
  # Store the page's ranks (the result of append() was previously discarded,
  # which left rank_list empty).
  rank_pages[[page_idx]] <- rank2
}
rank_list <- unlist(rank_pages)
# Assemble the scraped vectors into one data frame. data.frame() requires all
# of these vectors to have the same length.
# NOTE(review): Desc uses the raw `desc`; the cleaned `desc_new` computed
# earlier in the script is never used -- confirm which one is intended.
imdb_data <- data.frame(
  Rank = rank,
  Title = title,
  Desc = desc,
  Runtime = runtime_num,
  Ratings = ratings,
  Votes = votes,
  Director = director,
  Actor = actor
)

# View() opens a data viewer, so it is only called in interactive sessions.
if (interactive()) {
  View(imdb_data)
}
str(imdb_data)
######################################
# EDA
library("ggplot2")

# Distribution of runtimes (30 bins). Written with ggplot() because qplot()
# is deprecated in current ggplot2.
ggplot(imdb_data, aes(x = Runtime)) +
  geom_histogram(bins = 30)

ggplot(imdb_data, aes(x = Runtime, y = Ratings)) +
  geom_point()

# Use the data-frame column `Votes`; the lower-case `votes` only resolved by
# falling back to the global vector of the same name.
ggplot(imdb_data, aes(x = Runtime, y = Votes)) +
  geom_point()

ggplot(imdb_data, aes(x = Ratings, y = Votes)) +
  geom_point()
#########################################
# Print the description and title columns.
imdb_data$Desc
imdb_data$Title
# Build a text corpus from the descriptions, one document per row.
# Install tm only when it is missing instead of on every run.
if (!requireNamespace("tm", quietly = TRUE)) {
  install.packages("tm")
}
library(tm)

Corpus <- VCorpus(VectorSource(imdb_data$Desc))

# Inspect a few documents.
as.character(Corpus[[1]])
as.character(Corpus[[10]])
lapply(Corpus[1:5], as.character)
# Clean the corpus step by step. After each step the same document is printed
# from the raw and the cleaned corpus for comparison.

# Lower-case all text.
Corpus_clean <- tm_map(Corpus, content_transformer(tolower))
as.character(Corpus[[10]])
as.character(Corpus_clean[[10]])

# Remove numbers.
Corpus_clean <- tm_map(Corpus_clean, removeNumbers)
as.character(Corpus[[7]])
as.character(Corpus_clean[[7]])

# Remove tm's default stop-word list.
mystopwords <- stopwords()
Corpus_clean <- tm_map(Corpus_clean, removeWords, mystopwords)
as.character(Corpus[[7]])
as.character(Corpus_clean[[7]])

# Remove two additional words.
stopword_list <- c("will", "one")
Corpus_clean <- tm_map(Corpus_clean, removeWords, stopword_list)
as.character(Corpus[[1]])
as.character(Corpus_clean[[1]])

# Remove punctuation
Corpus_clean <- tm_map(Corpus_clean, removePunctuation)
as.character(Corpus[[1]])
as.character(Corpus_clean[[1]])
# Stem the words in the cleaned corpus, then collapse repeated whitespace.
# Install SnowballC only when it is missing instead of on every run.
if (!requireNamespace("SnowballC", quietly = TRUE)) {
  install.packages("SnowballC", dependencies = TRUE)
}
library(tm)
library(SnowballC)

# Sample word forms kept from the original script; not used below.
example <- c("learning", "learn", "learned", "learnt")

Corpus_clean <- tm_map(Corpus_clean, stemDocument)
as.character(Corpus[[1]])
as.character(Corpus_clean[[1]])

Corpus_clean <- tm_map(Corpus_clean, stripWhitespace)
as.character(Corpus[[7]])
as.character(Corpus_clean[[7]])
# Draw a word cloud of at most 50 words from the cleaned corpus.
# Install wordcloud only when it is missing instead of on every run.
if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud")
}
library(wordcloud)

wordcloud(Corpus_clean, max.words = 50)