Tugas IoT - Web Scraping

library(xml2)
library(rvest)
library(ggplot2)

# Download and parse the IMDb search results page: animation titles released
# between 2000-01-01 and 2020-01-01, sorted by number of votes (descending),
# 100 results per page.
webpage <- read_html(
  x = "https://www.imdb.com/search/title/?release_date=2000-01-01,2020-01-01&genres=animation&sort=num_votes,desc&count=100"
)
print(webpage)

## {html_document}
## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body id="styleguide-v2" class="fixed">\n            <img height="1" widt ...

# Select every node on the page that carries the "runtime" CSS class.
runtime_data_webpage <- html_nodes(x = webpage, css = ".runtime")
# Show only the first six nodes to keep the page short.
print(head(runtime_data_webpage, n = 6L))

## {xml_nodeset (6)}
## [1] <span class="runtime">98 min</span>
## [2] <span class="runtime">100 min</span>
## [3] <span class="runtime">96 min</span>
## [4] <span class="runtime">92 min</span>
## [5] <span class="runtime">103 min</span>
## [6] <span class="runtime">125 min</span>

# Extract the text content of each runtime node.
runtime_data <- html_text(x = runtime_data_webpage)
print(head(runtime_data, n = 6L))

## [1] "98 min"  "100 min" "96 min"  "92 min"  "103 min" "125 min"

# Strip the " min" suffix so that only the number of minutes remains.
runtime_data <- gsub(pattern = " min", replacement = "", x = runtime_data)
print(runtime_data)

##   [1] "98"  "100" "96"  "92"  "103" "125" "98"  "115" "111" "95"  "90"  "102"
##  [13] "95"  "117" "108" "105" "81"  "23"  "92"  "93"  "102" "100" "101" "117"
##  [25] "93"  "86"  "98"  "119" "104" "100" "24"  "102" "87"  "107" "24"  "93" 
##  [37] "23"  "118" "90"  "91"  "97"  "77"  "107" "95"  "91"  "106" "94"  "100"
##  [49] "118" "91"  "107" "90"  "87"  "96"  "89"  "100" "98"  "96"  "88"  "100"
##  [61] "93"  "87"  "78"  "107" "89"  "84"  "85"  "108" "106" "90"  "95"  "92" 
##  [73] "83"  "97"  "115" "30"  "103" "101" "108" "94"  "91"  "90"  "24"  "112"
##  [85] "95"  "22"  "24"  "104" "91"  "25"  "96"  "97"  "101" "79"  "86"  "15" 
##  [97] "85"  "89"  "101" "22"

# Convert the runtime strings to numbers.
runtime_data <- as.numeric(runtime_data)
print(runtime_data)

##   [1]  98 100  96  92 103 125  98 115 111  95  90 102  95 117 108 105  81  23
##  [19]  92  93 102 100 101 117  93  86  98 119 104 100  24 102  87 107  24  93
##  [37]  23 118  90  91  97  77 107  95  91 106  94 100 118  91 107  90  87  96
##  [55]  89 100  98  96  88 100  93  87  78 107  89  84  85 108 106  90  95  92
##  [73]  83  97 115  30 103 101 108  94  91  90  24 112  95  22  24 104  91  25
##  [91]  96  97 101  79  86  15  85  89 101  22

# Select every node on the page that carries the "genre" CSS class.
genre_data_webpage <- html_nodes(x = webpage, css = ".genre")
# Show only the first six nodes to keep the page short.
print(head(genre_data_webpage, n = 6L))

## {xml_nodeset (6)}
## [1] <span class="genre">\nAnimation, Adventure, Family            </span>
## [2] <span class="genre">\nAnimation, Adventure, Comedy            </span>
## [3] <span class="genre">\nAnimation, Adventure, Comedy            </span>
## [4] <span class="genre">\nAnimation, Adventure, Comedy            </span>
## [5] <span class="genre">\nAnimation, Adventure, Comedy            </span>
## [6] <span class="genre">\nAnimation, Adventure, Family            </span>

# Extract the text content of each genre node.
genre_data <- html_text(x = genre_data_webpage)
print(head(genre_data, n = 6L))

## [1] "\nAnimation, Adventure, Family            "
## [2] "\nAnimation, Adventure, Comedy            "
## [3] "\nAnimation, Adventure, Comedy            "
## [4] "\nAnimation, Adventure, Comedy            "
## [5] "\nAnimation, Adventure, Comedy            "
## [6] "\nAnimation, Adventure, Family            "

# Remove "Animation," so that only the other genres remain.
genre_data <- gsub(pattern = "Animation,", replacement = "", x = genre_data)
print(head(genre_data, n = 6L))

## [1] "\n Adventure, Family            " "\n Adventure, Comedy            "
## [3] "\n Adventure, Comedy            " "\n Adventure, Comedy            "
## [5] "\n Adventure, Comedy            " "\n Adventure, Family            "

# Remove the newline characters.
genre_data <- gsub(pattern = "\n", replacement = "", x = genre_data)
print(head(genre_data, n = 6L))

## [1] " Adventure, Family            " " Adventure, Comedy            "
## [3] " Adventure, Comedy            " " Adventure, Comedy            "
## [5] " Adventure, Comedy            " " Adventure, Family            "

# Remove every space character as well.
genre_data <- gsub(pattern = " ", replacement = "", x = genre_data)
print(head(genre_data, n = 6L))

## [1] "Adventure,Family" "Adventure,Comedy" "Adventure,Comedy" "Adventure,Comedy"
## [5] "Adventure,Comedy" "Adventure,Family"

# Classify each film by its first listed genre only: drop everything from the
# first comma onwards.
genre_data <- gsub(pattern = ",.*", replacement = "", x = genre_data)
print(genre_data)

##   [1] "Adventure" "Adventure" "Adventure" "Adventure" "Adventure" "Adventure"
##   [7] "Action"    "Action"    "Adventure" "Adventure" "Adventure" "Adventure"
##  [13] "Adventure" "Action"    "Adventure" "Adventure" "Adventure" "Adventure"
##  [19] "Action"    "Adventure" "Action"    "Adventure" "Adventure" "Adventure"
##  [25] "Adventure" "Adventure" "Adventure" "Adventure" "Adventure" "Action"   
##  [31] "Action"    "Action"    "Adventure" "Adventure" "Crime"     "Adventure"
##  [37] "Action"    "Action"    "Action"    "Adventure" "Adventure" "Drama"    
##  [43] "Action"    "Action"    "Comedy"    "Drama"     "Adventure" "Adventure"
##  [49] "Adventure" "Adventure" "Action"    "Adventure" "Adventure" "Adventure"
##  [55] "Adventure" "Drama"     "Adventure" "Adventure" "Adventure" "Adventure"
##  [61] "Adventure" "Adventure" "Adventure" "Adventure" "Adventure" "Adventure"
##  [67] "Adventure" "Adventure" "Adventure" "Adventure" "Adventure" "Comedy"   
##  [73] "Adventure" "Action"    "Action"    "Drama"     "Adventure" "Adventure"
##  [79] "Comedy"    "Action"    "Adventure" "Adventure" "Action"    "Adventure"
##  [85] "Action"    "Action"    "Action"    "Action"    "Adventure" "Comedy"   
##  [91] "Adventure" "Adventure" "Adventure" "Action"    "Adventure" "Short"    
##  [97] "Adventure" "Adventure" "Action"    "Comedy"

# Convert the genre strings to a factor.
genre_data <- as.factor(x = genre_data)
print(head(genre_data, n = 6L))

## [1] Adventure Adventure Adventure Adventure Adventure Adventure
## Levels: Action Adventure Comedy Crime Drama Short

# Select the <strong> nodes nested inside elements with the
# "ratings-imdb-rating" CSS class.
rating_data_webpage <- html_nodes(
  x = webpage,
  css = ".ratings-imdb-rating strong"
)
# Show only the first six nodes to keep the page short.
print(head(rating_data_webpage, n = 6L))

## {xml_nodeset (6)}
## [1] <strong>8.4</strong>
## [2] <strong>8.2</strong>
## [3] <strong>8.3</strong>
## [4] <strong>8.1</strong>
## [5] <strong>8.3</strong>
## [6] <strong>8.6</strong>

# Extract the text content of each rating node.
rating_data <- html_text(x = rating_data_webpage)
print(rating_data)

##   [1] "8.4" "8.2" "8.3" "8.1" "8.3" "8.6" "8.1" "8.0" "8.1" "8.2" "7.9" "7.5"
##  [13] "7.6" "8.4" "8.0" "8.4" "7.5" "9.2" "7.6" "7.3" "7.8" "7.7" "7.7" "7.2"
##  [25] "7.1" "6.9" "7.3" "8.2" "7.3" "7.7" "9.1" "7.8" "7.3" "7.6" "9.0" "6.1"
##  [37] "9.3" "7.6" "7.2" "6.8" "7.3" "7.3" "7.2" "7.3" "7.0" "8.4" "6.9" "7.7"
##  [49] "6.8" "6.4" "7.3" "6.9" "7.9" "6.9" "6.6" "7.7" "7.2" "6.9" "6.5" "6.6"
##  [61] "6.3" "6.5" "7.4" "7.1" "6.1" "7.1" "7.3" "6.4" "6.2" "6.0" "6.8" "8.1"
##  [73] "6.7" "7.2" "6.3" "9.4" "6.8" "7.8" "7.1" "6.5" "6.1" "6.6" "9.1" "7.0"
##  [85] "7.1" "8.6" "8.8" "7.3" "6.4" "8.8" "8.1" "7.1" "7.6" "7.0" "6.8" "8.4"
##  [97] "7.5" "6.2" "7.8" "7.4"

# Convert the rating strings to numbers.
rating_data <- as.numeric(rating_data)
print(rating_data)

##   [1] 8.4 8.2 8.3 8.1 8.3 8.6 8.1 8.0 8.1 8.2 7.9 7.5 7.6 8.4 8.0 8.4 7.5 9.2
##  [19] 7.6 7.3 7.8 7.7 7.7 7.2 7.1 6.9 7.3 8.2 7.3 7.7 9.1 7.8 7.3 7.6 9.0 6.1
##  [37] 9.3 7.6 7.2 6.8 7.3 7.3 7.2 7.3 7.0 8.4 6.9 7.7 6.8 6.4 7.3 6.9 7.9 6.9
##  [55] 6.6 7.7 7.2 6.9 6.5 6.6 6.3 6.5 7.4 7.1 6.1 7.1 7.3 6.4 6.2 6.0 6.8 8.1
##  [73] 6.7 7.2 6.3 9.4 6.8 7.8 7.1 6.5 6.1 6.6 9.1 7.0 7.1 8.6 8.8 7.3 6.4 8.8
##  [91] 8.1 7.1 7.6 7.0 6.8 8.4 7.5 6.2 7.8 7.4

# Select the nodes that hold the gross figures.
gross_data_webpage <- html_nodes(
  x = webpage,
  css = ".ghost~ .text-muted+ span"
)
# Show only the first six nodes to keep the page short.
print(head(gross_data_webpage, n = 6L))

## {xml_nodeset (6)}
## [1] <span name="nv" data-value="223,808,164">$223.81M</span>
## [2] <span name="nv" data-value="380,843,261">$380.84M</span>
## [3] <span name="nv" data-value="293,004,164">$293.00M</span>
## [4] <span name="nv" data-value="289,916,256">$289.92M</span>
## [5] <span name="nv" data-value="415,004,880">$415.00M</span>
## [6] <span name="nv" data-value="10,055,859">$10.06M</span>

# Extract the text content of each gross node.
gross_data <- html_text(x = gross_data_webpage)
print(gross_data)

##  [1] "$223.81M" "$380.84M" "$293.00M" "$289.92M" "$415.00M" "$10.06M" 
##  [7] "$217.58M" "$261.44M" "$206.45M" "$356.46M" "$267.67M" "$400.74M"
## [13] "$251.51M" "$190.24M" "$341.27M" "$209.73M" "$176.39M" "$215.43M"
## [19] "$436.47M" "$222.53M" "$200.82M" "$189.42M" "$244.08M" "$237.28M"
## [25] "$193.60M" "$368.06M" "$4.71M"   "$268.49M" "$257.76M" "$177.00M"
## [31] "$183.14M" "$248.76M" "$320.71M" "$608.58M" "$165.25M" "$195.33M"
## [37] "$486.30M" "$53.36M"  "$123.48M" "$148.42M" "$148.31M" "$5.02M"  
## [43] "$196.57M" "$434.04M" "$543.64M" "$336.05M" "$77.59M"  "$124.87M"
## [49] "$21.00M"  "$143.62M" "$180.01M" "$75.29M"  "$187.17M" "$114.05M"
## [55] "$161.32M" "$183.37M" "$238.37M" "$368.38M" "$89.30M"  "$127.81M"
## [61] "$97.69M"  "$106.83M" "$145.79M" "$198.00M" "$191.45M" "$160.86M"
## [67] "$216.39M" "$155.02M" "$103.41M" "$82.16M"  "$477.37M" "$32.02M" 
## [73] "$270.40M" "$198.35M" "$126.63M" "$149.26M" "$201.09M" "$143.53M"
## [79] "$175.75M" "$128.20M" "$104.40M" "$15.09M"  "$31.74M"  "$154.53M"
## [85] "$56.11M"  "$264.62M" "$48.02M"

# Remove the "M" suffix, then keep characters 2 to 6, which skips the
# leading "$".
# NOTE(review): at most five characters are kept, so three-digit figures are
# cut short (e.g. "223.81" becomes "223.8") -- confirm this is intended.
gross_data <- substr(
  gsub(pattern = "M", replacement = "", x = gross_data),
  start = 2,
  stop = 6
)
print(gross_data)

##  [1] "223.8" "380.8" "293.0" "289.9" "415.0" "10.06" "217.5" "261.4" "206.4"
## [10] "356.4" "267.6" "400.7" "251.5" "190.2" "341.2" "209.7" "176.3" "215.4"
## [19] "436.4" "222.5" "200.8" "189.4" "244.0" "237.2" "193.6" "368.0" "4.71" 
## [28] "268.4" "257.7" "177.0" "183.1" "248.7" "320.7" "608.5" "165.2" "195.3"
## [37] "486.3" "53.36" "123.4" "148.4" "148.3" "5.02"  "196.5" "434.0" "543.6"
## [46] "336.0" "77.59" "124.8" "21.00" "143.6" "180.0" "75.29" "187.1" "114.0"
## [55] "161.3" "183.3" "238.3" "368.3" "89.30" "127.8" "97.69" "106.8" "145.7"
## [64] "198.0" "191.4" "160.8" "216.3" "155.0" "103.4" "82.16" "477.3" "32.02"
## [73] "270.4" "198.3" "126.6" "149.2" "201.0" "143.5" "175.7" "128.2" "104.4"
## [82] "15.09" "31.74" "154.5" "56.11" "264.6" "48.02"

# Check how many gross values were scraped, because not every film lists a
# gross figure.
print(length(gross_data))

## [1] 87

# Not every film lists a gross figure, so the scraped vector is shorter than
# the other columns. Insert a missing value at each hard-coded position below.
# The positions index the padded vector, so they are applied in ascending
# order.
missing_positions <- c(18, 31, 35, 37, 72, 76, 83, 86, 87, 90, 91, 96, 100)
for (position in missing_positions) {
  # append(after = ) inserts correctly even when `position` is one past the
  # current end. Slicing with position:length(gross_data) counts backwards in
  # that case, which previously duplicated the last value as a 101st element
  # that then had to be dropped by hand.
  gross_data <- append(gross_data, NA_character_, after = position - 1)
}

# Convert the gross figures to numeric; the inserted entries stay NA.
gross_data <- as.numeric(gross_data)
gross_data

##   [1] 223.80 380.80 293.00 289.90 415.00  10.06 217.50 261.40 206.40 356.40
##  [11] 267.60 400.70 251.50 190.20 341.20 209.70 176.30     NA 215.40 436.40
##  [21] 222.50 200.80 189.40 244.00 237.20 193.60 368.00   4.71 268.40 257.70
##  [31]     NA 177.00 183.10 248.70     NA 320.70     NA 608.50 165.20 195.30
##  [41] 486.30  53.36 123.40 148.40 148.30   5.02 196.50 434.00 543.60 336.00
##  [51]  77.59 124.80  21.00 143.60 180.00  75.29 187.10 114.00 161.30 183.30
##  [61] 238.30 368.30  89.30 127.80  97.69 106.80 145.70 198.00 191.40 160.80
##  [71] 216.30     NA 155.00 103.40  82.16     NA 477.30  32.02 270.40 198.30
##  [81] 126.60 149.20     NA 201.00 143.50     NA     NA 175.70 128.20     NA
##  [91]     NA 104.40  15.09  31.74 154.50     NA  56.11 264.60  48.02     NA

# The padded vector must have as many entries as the other scraped columns.
length(gross_data)

## [1] 100

# Summary statistics of the gross figures, including the number of NAs.
print(summary(gross_data))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    4.71  127.20  190.20  204.93  259.55  608.50      13

# Combine the four scraped vectors into a single data frame and inspect its
# structure.
data_lengkap_film <- data.frame(
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Gross_Pendapatan = gross_data
)
str(data_lengkap_film)

## 'data.frame':    100 obs. of  4 variables:
##  $ Runtime         : num  98 100 96 92 103 125 98 115 111 95 ...
##  $ Genre           : Factor w/ 6 levels "Action","Adventure",..: 2 2 2 2 2 2 1 1 2 2 ...
##  $ Rating          : num  8.4 8.2 8.3 8.1 8.3 8.6 8.1 8 8.1 8.2 ...
##  $ Gross_Pendapatan: num  224 381 293 290 415 ...

# Number of columns in the data frame.
print(ncol(data_lengkap_film))

## [1] 4

# Per-column summary of the combined data frame.
print(summary(data_lengkap_film))

##     Runtime             Genre        Rating      Gross_Pendapatan
##  Min.   : 15.00   Action   :24   Min.   :6.000   Min.   :  4.71  
##  1st Qu.: 88.75   Adventure:65   1st Qu.:6.900   1st Qu.:127.20  
##  Median : 95.00   Comedy   : 5   Median :7.300   Median :190.20  
##  Mean   : 89.39   Crime    : 1   Mean   :7.460   Mean   :204.93  
##  3rd Qu.:102.00   Drama    : 4   3rd Qu.:8.025   3rd Qu.:259.55  
##  Max.   :125.00   Short    : 1   Max.   :9.400   Max.   :608.50  
##                                                  NA's   :13

# Histogram of runtimes (30 bins) with bars filled by genre. Written with
# ggplot() + geom_histogram() because qplot() is deprecated in ggplot2 3.4.0
# and later.
ggplot(data = data_lengkap_film, mapping = aes(x = Runtime, fill = Genre)) +
  geom_histogram(bins = 30)

# Scatter plot of gross revenue against runtime; point size encodes the
# rating and point colour encodes the genre.
ggplot(
  data = data_lengkap_film,
  mapping = aes(x = Runtime, y = Gross_Pendapatan)
) +
  geom_point(mapping = aes(size = Rating, colour = Genre))

## Warning: Removed 13 rows containing missing values (geom_point).

Tugas IoT - Web Scraping

Yusraini Nurul Asra

4/24/2022