library(xml2)
library(rvest)
library(ggplot2)
# Download and parse the IMDb search-results page. The query string asks for
# animation titles released between 2000-01-01 and 2020-01-01, sorted by
# number of votes (descending), 100 titles per page.
webpage <- read_html(
  x = "https://www.imdb.com/search/title/?release_date=2000-01-01,2020-01-01&genres=animation&sort=num_votes,desc&count=100"
)
print(webpage)
## {html_document}
## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body id="styleguide-v2" class="fixed">\n <img height="1" widt ...
# Select every node carrying the CSS class "runtime".
runtime_data_webpage <- html_nodes(x = webpage, css = ".runtime")
# Only the first six nodes are shown to keep the printed page short.
head(runtime_data_webpage, n = 6L)
## {xml_nodeset (6)}
## [1] <span class="runtime">98 min</span>
## [2] <span class="runtime">100 min</span>
## [3] <span class="runtime">96 min</span>
## [4] <span class="runtime">92 min</span>
## [5] <span class="runtime">103 min</span>
## [6] <span class="runtime">125 min</span>
# Pull the text content out of each runtime node.
runtime_data <- html_text(x = runtime_data_webpage)
head(runtime_data, n = 6L)
## [1] "98 min" "100 min" "96 min" "92 min" "103 min" "125 min"
# Drop the " min" unit so that only the digits remain.
runtime_data <- gsub(
  pattern = " min",
  replacement = "",
  x = runtime_data,
  fixed = TRUE
)
print(runtime_data)
## [1] "98" "100" "96" "92" "103" "125" "98" "115" "111" "95" "90" "102"
## [13] "95" "117" "108" "105" "81" "23" "92" "93" "102" "100" "101" "117"
## [25] "93" "86" "98" "119" "104" "100" "24" "102" "87" "107" "24" "93"
## [37] "23" "118" "90" "91" "97" "77" "107" "95" "91" "106" "94" "100"
## [49] "118" "91" "107" "90" "87" "96" "89" "100" "98" "96" "88" "100"
## [61] "93" "87" "78" "107" "89" "84" "85" "108" "106" "90" "95" "92"
## [73] "83" "97" "115" "30" "103" "101" "108" "94" "91" "90" "24" "112"
## [85] "95" "22" "24" "104" "91" "25" "96" "97" "101" "79" "86" "15"
## [97] "85" "89" "101" "22"
# Convert the runtime strings to numbers.
runtime_data <- as.double(runtime_data)
print(runtime_data)
## [1] 98 100 96 92 103 125 98 115 111 95 90 102 95 117 108 105 81 23
## [19] 92 93 102 100 101 117 93 86 98 119 104 100 24 102 87 107 24 93
## [37] 23 118 90 91 97 77 107 95 91 106 94 100 118 91 107 90 87 96
## [55] 89 100 98 96 88 100 93 87 78 107 89 84 85 108 106 90 95 92
## [73] 83 97 115 30 103 101 108 94 91 90 24 112 95 22 24 104 91 25
## [91] 96 97 101 79 86 15 85 89 101 22
# Select every node carrying the CSS class "genre".
genre_data_webpage <- html_nodes(x = webpage, css = ".genre")
# Only the first six nodes are shown to keep the printed page short.
head(genre_data_webpage, n = 6L)
## {xml_nodeset (6)}
## [1] <span class="genre">\nAnimation, Adventure, Family </span>
## [2] <span class="genre">\nAnimation, Adventure, Comedy </span>
## [3] <span class="genre">\nAnimation, Adventure, Comedy </span>
## [4] <span class="genre">\nAnimation, Adventure, Comedy </span>
## [5] <span class="genre">\nAnimation, Adventure, Comedy </span>
## [6] <span class="genre">\nAnimation, Adventure, Family </span>
# Pull the text content out of each genre node.
genre_data <- html_text(x = genre_data_webpage)
head(genre_data, n = 6L)
## [1] "\nAnimation, Adventure, Family "
## [2] "\nAnimation, Adventure, Comedy "
## [3] "\nAnimation, Adventure, Comedy "
## [4] "\nAnimation, Adventure, Comedy "
## [5] "\nAnimation, Adventure, Comedy "
## [6] "\nAnimation, Adventure, Family "
# Every title on this page is an animation, so drop the "Animation," entry
# and keep only the remaining genres.
genre_data <- gsub(
  pattern = "Animation,",
  replacement = "",
  x = genre_data,
  fixed = TRUE
)
head(genre_data, n = 6L)
## [1] "\n Adventure, Family " "\n Adventure, Comedy "
## [3] "\n Adventure, Comedy " "\n Adventure, Comedy "
## [5] "\n Adventure, Comedy " "\n Adventure, Family "
# Remove the newline characters.
genre_data <- gsub(pattern = "\n", replacement = "", x = genre_data, fixed = TRUE)
head(genre_data, n = 6L)
## [1] " Adventure, Family " " Adventure, Comedy "
## [3] " Adventure, Comedy " " Adventure, Comedy "
## [5] " Adventure, Comedy " " Adventure, Family "
# Remove the space characters as well.
genre_data <- gsub(pattern = " ", replacement = "", x = genre_data, fixed = TRUE)
head(genre_data, n = 6L)
## [1] "Adventure,Family" "Adventure,Comedy" "Adventure,Comedy" "Adventure,Comedy"
## [5] "Adventure,Comedy" "Adventure,Family"
# Classify each title by its first remaining genre only: everything from the
# first comma onwards is discarded.
genre_data <- gsub(pattern = ",.*", replacement = "", x = genre_data)
print(genre_data)
## [1] "Adventure" "Adventure" "Adventure" "Adventure" "Adventure" "Adventure"
## [7] "Action" "Action" "Adventure" "Adventure" "Adventure" "Adventure"
## [13] "Adventure" "Action" "Adventure" "Adventure" "Adventure" "Adventure"
## [19] "Action" "Adventure" "Action" "Adventure" "Adventure" "Adventure"
## [25] "Adventure" "Adventure" "Adventure" "Adventure" "Adventure" "Action"
## [31] "Action" "Action" "Adventure" "Adventure" "Crime" "Adventure"
## [37] "Action" "Action" "Action" "Adventure" "Adventure" "Drama"
## [43] "Action" "Action" "Comedy" "Drama" "Adventure" "Adventure"
## [49] "Adventure" "Adventure" "Action" "Adventure" "Adventure" "Adventure"
## [55] "Adventure" "Drama" "Adventure" "Adventure" "Adventure" "Adventure"
## [61] "Adventure" "Adventure" "Adventure" "Adventure" "Adventure" "Adventure"
## [67] "Adventure" "Adventure" "Adventure" "Adventure" "Adventure" "Comedy"
## [73] "Adventure" "Action" "Action" "Drama" "Adventure" "Adventure"
## [79] "Comedy" "Action" "Adventure" "Adventure" "Action" "Adventure"
## [85] "Action" "Action" "Action" "Action" "Adventure" "Comedy"
## [91] "Adventure" "Adventure" "Adventure" "Action" "Adventure" "Short"
## [97] "Adventure" "Adventure" "Action" "Comedy"
# Turn the genre strings into a factor.
genre_data <- as.factor(x = genre_data)
head(genre_data, n = 6L)
## [1] Adventure Adventure Adventure Adventure Adventure Adventure
## Levels: Action Adventure Comedy Crime Drama Short
# Select the <strong> nodes nested inside elements of class
# "ratings-imdb-rating".
rating_data_webpage <- html_nodes(x = webpage, css = ".ratings-imdb-rating strong")
# Only the first six nodes are shown to keep the printed page short.
head(rating_data_webpage, n = 6L)
## {xml_nodeset (6)}
## [1] <strong>8.4</strong>
## [2] <strong>8.2</strong>
## [3] <strong>8.3</strong>
## [4] <strong>8.1</strong>
## [5] <strong>8.3</strong>
## [6] <strong>8.6</strong>
# Pull the text content out of each rating node.
rating_data <- html_text(x = rating_data_webpage)
print(rating_data)
## [1] "8.4" "8.2" "8.3" "8.1" "8.3" "8.6" "8.1" "8.0" "8.1" "8.2" "7.9" "7.5"
## [13] "7.6" "8.4" "8.0" "8.4" "7.5" "9.2" "7.6" "7.3" "7.8" "7.7" "7.7" "7.2"
## [25] "7.1" "6.9" "7.3" "8.2" "7.3" "7.7" "9.1" "7.8" "7.3" "7.6" "9.0" "6.1"
## [37] "9.3" "7.6" "7.2" "6.8" "7.3" "7.3" "7.2" "7.3" "7.0" "8.4" "6.9" "7.7"
## [49] "6.8" "6.4" "7.3" "6.9" "7.9" "6.9" "6.6" "7.7" "7.2" "6.9" "6.5" "6.6"
## [61] "6.3" "6.5" "7.4" "7.1" "6.1" "7.1" "7.3" "6.4" "6.2" "6.0" "6.8" "8.1"
## [73] "6.7" "7.2" "6.3" "9.4" "6.8" "7.8" "7.1" "6.5" "6.1" "6.6" "9.1" "7.0"
## [85] "7.1" "8.6" "8.8" "7.3" "6.4" "8.8" "8.1" "7.1" "7.6" "7.0" "6.8" "8.4"
## [97] "7.5" "6.2" "7.8" "7.4"
# Convert the rating strings to numbers.
rating_data <- as.double(rating_data)
print(rating_data)
## [1] 8.4 8.2 8.3 8.1 8.3 8.6 8.1 8.0 8.1 8.2 7.9 7.5 7.6 8.4 8.0 8.4 7.5 9.2
## [19] 7.6 7.3 7.8 7.7 7.7 7.2 7.1 6.9 7.3 8.2 7.3 7.7 9.1 7.8 7.3 7.6 9.0 6.1
## [37] 9.3 7.6 7.2 6.8 7.3 7.3 7.2 7.3 7.0 8.4 6.9 7.7 6.8 6.4 7.3 6.9 7.9 6.9
## [55] 6.6 7.7 7.2 6.9 6.5 6.6 6.3 6.5 7.4 7.1 6.1 7.1 7.3 6.4 6.2 6.0 6.8 8.1
## [73] 6.7 7.2 6.3 9.4 6.8 7.8 7.1 6.5 6.1 6.6 9.1 7.0 7.1 8.6 8.8 7.3 6.4 8.8
## [91] 8.1 7.1 7.6 7.0 6.8 8.4 7.5 6.2 7.8 7.4
# Select the gross-revenue <span> nodes with the CSS selector below.
gross_data_webpage <- html_nodes(x = webpage, css = ".ghost~ .text-muted+ span")
# Only the first six nodes are shown to keep the printed page short.
head(gross_data_webpage, n = 6L)
## {xml_nodeset (6)}
## [1] <span name="nv" data-value="223,808,164">$223.81M</span>
## [2] <span name="nv" data-value="380,843,261">$380.84M</span>
## [3] <span name="nv" data-value="293,004,164">$293.00M</span>
## [4] <span name="nv" data-value="289,916,256">$289.92M</span>
## [5] <span name="nv" data-value="415,004,880">$415.00M</span>
## [6] <span name="nv" data-value="10,055,859">$10.06M</span>
# Pull the text content out of each gross-revenue node.
gross_data <- html_text(x = gross_data_webpage)
print(gross_data)
## [1] "$223.81M" "$380.84M" "$293.00M" "$289.92M" "$415.00M" "$10.06M"
## [7] "$217.58M" "$261.44M" "$206.45M" "$356.46M" "$267.67M" "$400.74M"
## [13] "$251.51M" "$190.24M" "$341.27M" "$209.73M" "$176.39M" "$215.43M"
## [19] "$436.47M" "$222.53M" "$200.82M" "$189.42M" "$244.08M" "$237.28M"
## [25] "$193.60M" "$368.06M" "$4.71M" "$268.49M" "$257.76M" "$177.00M"
## [31] "$183.14M" "$248.76M" "$320.71M" "$608.58M" "$165.25M" "$195.33M"
## [37] "$486.30M" "$53.36M" "$123.48M" "$148.42M" "$148.31M" "$5.02M"
## [43] "$196.57M" "$434.04M" "$543.64M" "$336.05M" "$77.59M" "$124.87M"
## [49] "$21.00M" "$143.62M" "$180.01M" "$75.29M" "$187.17M" "$114.05M"
## [55] "$161.32M" "$183.37M" "$238.37M" "$368.38M" "$89.30M" "$127.81M"
## [61] "$97.69M" "$106.83M" "$145.79M" "$198.00M" "$191.45M" "$160.86M"
## [67] "$216.39M" "$155.02M" "$103.41M" "$82.16M" "$477.37M" "$32.02M"
## [73] "$270.40M" "$198.35M" "$126.63M" "$149.26M" "$201.09M" "$143.53M"
## [79] "$175.75M" "$128.20M" "$104.40M" "$15.09M" "$31.74M" "$154.53M"
## [85] "$56.11M" "$264.62M" "$48.02M"
# Remove the "M" marker from each figure.
gross_data <- gsub(pattern = "M", replacement = "", x = gross_data, fixed = TRUE)
# Keep characters 2 to 6 of each string, which skips the leading "$".
# NOTE(review): this 5-character window also cuts off the last decimal of
# values with three integer digits (e.g. "$223.81" becomes "223.8").
gross_data <- substr(gross_data, start = 2, stop = 6)
print(gross_data)
## [1] "223.8" "380.8" "293.0" "289.9" "415.0" "10.06" "217.5" "261.4" "206.4"
## [10] "356.4" "267.6" "400.7" "251.5" "190.2" "341.2" "209.7" "176.3" "215.4"
## [19] "436.4" "222.5" "200.8" "189.4" "244.0" "237.2" "193.6" "368.0" "4.71"
## [28] "268.4" "257.7" "177.0" "183.1" "248.7" "320.7" "608.5" "165.2" "195.3"
## [37] "486.3" "53.36" "123.4" "148.4" "148.3" "5.02" "196.5" "434.0" "543.6"
## [46] "336.0" "77.59" "124.8" "21.00" "143.6" "180.0" "75.29" "187.1" "114.0"
## [55] "161.3" "183.3" "238.3" "368.3" "89.30" "127.8" "97.69" "106.8" "145.7"
## [64] "198.0" "191.4" "160.8" "216.3" "155.0" "103.4" "82.16" "477.3" "32.02"
## [73] "270.4" "198.3" "126.6" "149.2" "201.0" "143.5" "175.7" "128.2" "104.4"
## [82] "15.09" "31.74" "154.5" "56.11" "264.6" "48.02"
length(gross_data) # check how many gross figures were scraped; not every title lists one
## [1] 87
# Positions (in page order) of the titles that have no gross figure. These
# are hard-coded for the page as it was scraped here.
missing_positions <- c(18, 31, 35, 37, 72, 76, 83, 86, 87, 90, 91, 96, 100)
total_titles <- length(gross_data) + length(missing_positions)
stopifnot(
  !anyDuplicated(missing_positions),
  all(missing_positions >= 1),
  all(missing_positions <= total_titles)
)
# Start from an all-NA numeric vector and drop the scraped values into every
# position that is not listed as missing. Inserting one element at a time
# with `i:length(gross_data)` breaks when a missing position is the very last
# one: the range then counts downwards (e.g. 100:99) and duplicates the final
# value. setdiff() is used instead of negative indexing so that an empty
# `missing_positions` also works.
observed_positions <- setdiff(seq_len(total_titles), missing_positions)
gross_filled <- rep(NA_real_, total_titles)
gross_filled[observed_positions] <- as.numeric(gross_data)
gross_data <- gross_filled
gross_data
## [1] 223.80 380.80 293.00 289.90 415.00 10.06 217.50 261.40 206.40 356.40
## [11] 267.60 400.70 251.50 190.20 341.20 209.70 176.30 NA 215.40 436.40
## [21] 222.50 200.80 189.40 244.00 237.20 193.60 368.00 4.71 268.40 257.70
## [31] NA 177.00 183.10 248.70 NA 320.70 NA 608.50 165.20 195.30
## [41] 486.30 53.36 123.40 148.40 148.30 5.02 196.50 434.00 543.60 336.00
## [51] 77.59 124.80 21.00 143.60 180.00 75.29 187.10 114.00 161.30 183.30
## [61] 238.30 368.30 89.30 127.80 97.69 106.80 145.70 198.00 191.40 160.80
## [71] 216.30 NA 155.00 103.40 82.16 NA 477.30 32.02 270.40 198.30
## [81] 126.60 149.20 NA 201.00 143.50 NA NA 175.70 128.20 NA
## [91] NA 104.40 15.09 31.74 154.50 NA 56.11 264.60 48.02 NA
length(gross_data)
## [1] 100
summary(gross_data)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 4.71 127.20 190.20 204.93 259.55 608.50 13
# Combine the four scraped vectors into one data frame, one row per title.
data_lengkap_film <- data.frame(
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Gross_Pendapatan = gross_data
)
# Inspect the structure of the resulting data frame.
str(data_lengkap_film)
## 'data.frame': 100 obs. of 4 variables:
## $ Runtime : num 98 100 96 92 103 125 98 115 111 95 ...
## $ Genre : Factor w/ 6 levels "Action","Adventure",..: 2 2 2 2 2 2 1 1 2 2 ...
## $ Rating : num 8.4 8.2 8.3 8.1 8.3 8.6 8.1 8 8.1 8.2 ...
## $ Gross_Pendapatan: num 224 381 293 290 415 ...
# length() of a data frame is its number of columns.
print(length(data_lengkap_film))
## [1] 4
# Per-column summary statistics.
print(summary(data_lengkap_film))
## Runtime Genre Rating Gross_Pendapatan
## Min. : 15.00 Action :24 Min. :6.000 Min. : 4.71
## 1st Qu.: 88.75 Adventure:65 1st Qu.:6.900 1st Qu.:127.20
## Median : 95.00 Comedy : 5 Median :7.300 Median :190.20
## Mean : 89.39 Crime : 1 Mean :7.460 Mean :204.93
## 3rd Qu.:102.00 Drama : 4 3rd Qu.:8.025 3rd Qu.:259.55
## Max. :125.00 Short : 1 Max. :9.400 Max. :608.50
## NA's :13
# Histogram of runtimes (30 bins), bars filled by genre. Written with
# ggplot() + geom_histogram() because qplot() is deprecated as of
# ggplot2 3.4.0; for a numeric x with no y, qplot() draws this same histogram.
ggplot(data_lengkap_film, aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30)

# Scatter plot of gross revenue against runtime; point size encodes the
# rating and point colour encodes the genre. Rows whose gross revenue is NA
# are dropped by geom_point() with a warning.
ggplot(data_lengkap_film, aes(x = Runtime, y = Gross_Pendapatan)) +
  geom_point(aes(size = Rating, colour = Genre))
## Warning: Removed 13 rows containing missing values (geom_point).
