train <- read.csv(file=file.choose(), header=TRUE)
train1 <- read.csv(file=file.choose(), header=TRUE)
data <- train
attach(data)

1. Setup

A. 겹치는 자료 제외

sum(duplicated(movie_title)|duplicated(movie_title,fromLast=TRUE))
## [1] 239
length(unique(movie_title[duplicated(movie_title)|duplicated(movie_title,fromLast=TRUE)]))
## [1] 116
movie_title[duplicated(movie_title)|duplicated(movie_title,fromLast=TRUE)]
##   [1] Spider-Man 3<U+00A0>                             
##   [2] The Avengers<U+00A0>                             
##   [3] King Kong<U+00A0>                                
##   [4] Skyfall<U+00A0>                                  
##   [5] Alice in Wonderland<U+00A0>                      
##   [6] Oz the Great and Powerful<U+00A0>                
##   [7] TRON: Legacy<U+00A0>                             
##   [8] The Great Gatsby<U+00A0>                         
##   [9] The Legend of Tarzan<U+00A0>                     
##  [10] The Jungle Book<U+00A0>                          
##  [11] The Lovers<U+00A0>                               
##  [12] Godzilla Resurgence<U+00A0>                      
##  [13] The Fast and the Furious<U+00A0>                 
##  [14] The Legend of Tarzan<U+00A0>                     
##  [15] Pan<U+00A0>                                      
##  [16] Ghostbusters<U+00A0>                             
##  [17] Exodus: Gods and Kings<U+00A0>                   
##  [18] The Twilight Saga: Breaking Dawn - Part 2<U+00A0>
##  [19] The Twilight Saga: Breaking Dawn - Part 2<U+00A0>
##  [20] Home<U+00A0>                                     
##  [21] Godzilla Resurgence<U+00A0>                      
##  [22] Clash of the Titans<U+00A0>                      
##  [23] Total Recall<U+00A0>                             
##  [24] RoboCop<U+00A0>                                  
##  [25] Teenage Mutant Ninja Turtles<U+00A0>             
##  [26] Around the World in 80 Days<U+00A0>              
##  [27] The Island<U+00A0>                               
##  [28] Casino Royale<U+00A0>                            
##  [29] Planet of the Apes<U+00A0>                       
##  [30] Pan<U+00A0>                                      
##  [31] The Tourist<U+00A0>                              
##  [32] Hercules<U+00A0>                                 
##  [33] Point Break<U+00A0>                              
##  [34] Cinderella<U+00A0>                               
##  [35] The Lovely Bones<U+00A0>                         
##  [36] The Alamo<U+00A0>                                
##  [37] Ben-Hur<U+00A0>                                  
##  [38] Conan the Barbarian<U+00A0>                      
##  [39] The Fast and the Furious<U+00A0>                 
##  [40] Dredd<U+00A0>                                    
##  [41] Creepshow<U+00A0>                                
##  [42] The Day the Earth Stood Still<U+00A0>            
##  [43] Hercules<U+00A0>                                 
##  [44] Total Recall<U+00A0>                             
##  [45] Unbroken<U+00A0>                                 
##  [46] Jack Reacher<U+00A0>                             
##  [47] Mercury Rising<U+00A0>                           
##  [48] The Avengers<U+00A0>                             
##  [49] Goosebumps<U+00A0>                               
##  [50] The Watch<U+00A0>                                
##  [51] Lolita<U+00A0>                                   
##  [52] Syriana<U+00A0>                                  
##  [53] Murder by Numbers<U+00A0>                        
##  [54] The Host<U+00A0>                                 
##  [55] First Blood<U+00A0>                              
##  [56] From Hell<U+00A0>                                
##  [57] Across the Universe<U+00A0>                      
##  [58] Dredd<U+00A0>                                    
##  [59] Victor Frankenstein<U+00A0>                      
##  [60] Hero<U+00A0>                                     
##  [61] The Karate Kid<U+00A0>                           
##  [62] Unbroken<U+00A0>                                 
##  [63] Unknown<U+00A0>                                  
##  [64] Victor Frankenstein<U+00A0>                      
##  [65] Disturbia<U+00A0>                                
##  [66] The Fast and the Furious<U+00A0>                 
##  [67] Precious<U+00A0>                                 
##  [68] Twilight<U+00A0>                                 
##  [69] Aloha<U+00A0>                                    
##  [70] A Nightmare on Elm Street<U+00A0>                
##  [71] Poltergeist<U+00A0>                              
##  [72] From Hell<U+00A0>                                
##  [73] House of Wax<U+00A0>                             
##  [74] Heist<U+00A0>                                    
##  [75] Sabotage<U+00A0>                                 
##  [76] The Lovers<U+00A0>                               
##  [77] Snakes on a Plane<U+00A0>                        
##  [78] Ghostbusters<U+00A0>                             
##  [79] Dodgeball: A True Underdog Story<U+00A0>         
##  [80] Carrie<U+00A0>                                   
##  [81] Side Effects<U+00A0>                             
##  [82] Wicker Park<U+00A0>                              
##  [83] Chasing Liberty<U+00A0>                          
##  [84] Glory<U+00A0>                                    
##  [85] Dawn of the Dead<U+00A0>                         
##  [86] The Jungle Book<U+00A0>                          
##  [87] Lucky Number Slevin<U+00A0>                      
##  [88] Brothers<U+00A0>                                 
##  [89] The Omen<U+00A0>                                 
##  [90] The Gambler<U+00A0>                              
##  [91] Eddie the Eagle<U+00A0>                          
##  [92] My Soul to Take<U+00A0>                          
##  [93] The Possession<U+00A0>                           
##  [94] Snakes on a Plane<U+00A0>                        
##  [95] Dangerous Liaisons<U+00A0>                       
##  [96] Point Break<U+00A0>                              
##  [97] Footloose<U+00A0>                                
##  [98] King Kong<U+00A0>                                
##  [99] Eddie the Eagle<U+00A0>                          
## [100] Chasing Liberty<U+00A0>                          
## [101] Forsaken<U+00A0>                                 
## [102] Victor Frankenstein<U+00A0>                      
## [103] The Island<U+00A0>                               
## [104] Death at a Funeral<U+00A0>                       
## [105] Disturbia<U+00A0>                                
## [106] Wicker Park<U+00A0>                              
## [107] The French Connection<U+00A0>                    
## [108] Bad Moms<U+00A0>                                 
## [109] Conan the Barbarian<U+00A0>                      
## [110] Twilight<U+00A0>                                 
## [111] Death at a Funeral<U+00A0>                       
## [112] Left Behind<U+00A0>                              
## [113] Glory<U+00A0>                                    
## [114] The Fog<U+00A0>                                  
## [115] Hamlet<U+00A0>                                   
## [116] Day of the Dead<U+00A0>                          
## [117] Forsaken<U+00A0>                                 
## [118] Halloween<U+00A0>                                
## [119] Hero<U+00A0>                                     
## [120] TRON: Legacy<U+00A0>                             
## [121] The Illusionist<U+00A0>                          
## [122] The Illusionist<U+00A0>                          
## [123] The Unborn<U+00A0>                               
## [124] Bad Moms<U+00A0>                                 
## [125] Left Behind<U+00A0>                              
## [126] Trance<U+00A0>                                   
## [127] Ben-Hur<U+00A0>                                  
## [128] Halloween<U+00A0>                                
## [129] Big Fat Liar<U+00A0>                             
## [130] Snitch<U+00A0>                                   
## [131] Aloha<U+00A0>                                    
## [132] Halloween II<U+00A0>                             
## [133] The Last House on the Left<U+00A0>               
## [134] Clash of the Titans<U+00A0>                      
## [135] The Love Letter<U+00A0>                          
## [136] The Possession<U+00A0>                           
## [137] First Blood<U+00A0>                              
## [138] Dangerous Liaisons<U+00A0>                       
## [139] Big Fat Liar<U+00A0>                             
## [140] Teenage Mutant Ninja Turtles<U+00A0>             
## [141] Dekalog<U+00A0>                                  
## [142] RoboCop<U+00A0>                                  
## [143] Brothers<U+00A0>                                 
## [144] The Claim<U+00A0>                                
## [145] Cat People<U+00A0>                               
## [146] Crossroads<U+00A0>                               
## [147] Casino Royale<U+00A0>                            
## [148] The Alamo<U+00A0>                                
## [149] The Host<U+00A0>                                 
## [150] A Woman, a Gun and a Noodle Shop<U+00A0>         
## [151] Home<U+00A0>                                     
## [152] Snatch<U+00A0>                                   
## [153] History of the World: Part I<U+00A0>             
## [154] O<U+00A0>                                        
## [155] Poltergeist<U+00A0>                              
## [156] Precious<U+00A0>                                 
## [157] Snatch<U+00A0>                                   
## [158] The Gift<U+00A0>                                 
## [159] The Tourist<U+00A0>                              
## [160] Crash<U+00A0>                                    
## [161] Dekalog<U+00A0>                                  
## [162] The Texas Chain Saw Massacre<U+00A0>             
## [163] Heist<U+00A0>                                    
## [164] Footloose<U+00A0>                                
## [165] The Karate Kid<U+00A0>                           
## [166] Creepshow<U+00A0>                                
## [167] Syriana<U+00A0>                                  
## [168] Crash<U+00A0>                                    
## [169] Spider-Man 3<U+00A0>                             
## [170] Juno<U+00A0>                                     
## [171] The Great Gatsby<U+00A0>                         
## [172] The Claim<U+00A0>                                
## [173] Skyfall<U+00A0>                                  
## [174] The Return of the Living Dead<U+00A0>            
## [175] Around the World in 80 Days<U+00A0>              
## [176] Murder by Numbers<U+00A0>                        
## [177] The Gift<U+00A0>                                 
## [178] 20,000 Leagues Under the Sea<U+00A0>             
## [179] O<U+00A0>                                        
## [180] Out of the Blue<U+00A0>                          
## [181] Saving Grace<U+00A0>                             
## [182] Out of the Blue<U+00A0>                          
## [183] Pan<U+00A0>                                      
## [184] Night of the Living Dead<U+00A0>                 
## [185] Exodus: Gods and Kings<U+00A0>                   
## [186] The Return of the Living Dead<U+00A0>            
## [187] Saving Grace<U+00A0>                             
## [188] Planet of the Apes<U+00A0>                       
## [189] My Soul to Take<U+00A0>                          
## [190] Ben-Hur<U+00A0>                                  
## [191] Unknown<U+00A0>                                  
## [192] The Full Monty<U+00A0>                           
## [193] Day of the Dead<U+00A0>                          
## [194] The Watch<U+00A0>                                
## [195] The Gambler<U+00A0>                              
## [196] Alice in Wonderland<U+00A0>                      
## [197] Cinderella<U+00A0>                               
## [198] The Omen<U+00A0>                                 
## [199] Halloween II<U+00A0>                             
## [200] Dodgeball: A True Underdog Story<U+00A0>         
## [201] The Calling<U+00A0>                              
## [202] The French Connection<U+00A0>                    
## [203] Lolita<U+00A0>                                   
## [204] Hamlet<U+00A0>                                   
## [205] Carrie<U+00A0>                                   
## [206] A Nightmare on Elm Street<U+00A0>                
## [207] Dawn of the Dead<U+00A0>                         
## [208] A Woman, a Gun and a Noodle Shop<U+00A0>         
## [209] Jack Reacher<U+00A0>                             
## [210] Mercury Rising<U+00A0>                           
## [211] The Day the Earth Stood Still<U+00A0>            
## [212] Lucky Number Slevin<U+00A0>                      
## [213] The Fog<U+00A0>                                  
## [214] Juno<U+00A0>                                     
## [215] The Full Monty<U+00A0>                           
## [216] Goosebumps<U+00A0>                               
## [217] History of the World: Part I<U+00A0>             
## [218] The Lovely Bones<U+00A0>                         
## [219] Trance<U+00A0>                                   
## [220] Snitch<U+00A0>                                   
## [221] The Unborn<U+00A0>                               
## [222] King Kong<U+00A0>                                
## [223] House of Wax<U+00A0>                             
## [224] Home<U+00A0>                                     
## [225] Crossroads<U+00A0>                               
## [226] Oz the Great and Powerful<U+00A0>                
## [227] Halloween<U+00A0>                                
## [228] Across the Universe<U+00A0>                      
## [229] The Love Letter<U+00A0>                          
## [230] 20,000 Leagues Under the Sea<U+00A0>             
## [231] Side Effects<U+00A0>                             
## [232] The Calling<U+00A0>                              
## [233] The Texas Chain Saw Massacre<U+00A0>             
## [234] Cat People<U+00A0>                               
## [235] A Dog's Breakfast<U+00A0>                        
## [236] A Dog's Breakfast<U+00A0>                        
## [237] Night of the Living Dead<U+00A0>                 
## [238] The Last House on the Left<U+00A0>               
## [239] Sabotage<U+00A0>                                 
## 4818 Levels: #Horror<U+00A0> [Rec] 2<U+00A0> ... Zulu<U+00A0>
which(duplicated(movie_title)|duplicated(movie_title,fromLast=TRUE))
##   [1]    7   18   25   30   33   38   40   50   63   78   83   96   98  134
##  [15]  142  147  156  171  184  185  200  209  210  226  240  263  272  278
##  [29]  279  294  296  305  309  326  327  338  357  378  383  416  418  456
##  [43]  578  642  649  723  769  778  799  854  873  952  979  982 1039 1075
##  [57] 1092 1101 1125 1149 1162 1198 1209 1280 1294 1307 1332 1352 1362 1393
##  [71] 1411 1422 1424 1436 1465 1480 1502 1553 1566 1631 1637 1666 1674 1682
##  [85] 1728 1773 1780 1820 1861 1906 1910 1928 1949 1959 1981 1985 1990 2011
##  [99] 2021 2025 2035 2061 2067 2096 2127 2130 2140 2141 2153 2222 2251 2356
## [113] 2374 2376 2410 2423 2426 2446 2447 2453 2477 2487 2500 2516 2522 2538
## [127] 2564 2570 2579 2583 2590 2596 2598 2601 2660 2720 2721 2726 2747 2769
## [141] 2773 2785 2831 2836 2851 2884 2893 2920 2937 2955 2958 2965 2989 3011
## [155] 3016 3042 3062 3102 3114 3137 3150 3221 3258 3286 3292 3310 3322 3391
## [169] 3400 3402 3415 3419 3432 3515 3523 3586 3635 3641 3658 3718 3727 3767
## [183] 3804 3812 3816 3821 3825 3833 3840 3892 3906 3908 3925 3933 4039 4049
## [197] 4061 4070 4100 4120 4140 4143 4173 4199 4267 4269 4316 4323 4358 4377
## [211] 4400 4404 4442 4465 4473 4477 4481 4492 4538 4555 4587 4600 4601 4656
## [225] 4672 4681 4723 4744 4784 4796 4807 4829 4837 4843 4850 4851 4852 4871
## [239] 4911
  • 영화 상영시간으로 중복되는 자료를 제거할 것이다(만약 상영시간, 감독 이름, 발매 년도가 다르면 제목만 같고 다른 영화)
too.much.work1 <- c()
too.much.work2 <- c()
too.much.work3 <- c()

for(i in 1:sum(duplicated(movie_title)|duplicated(movie_title,fromLast=TRUE))){
if(length(unique(director_name[movie_title==movie_title[duplicated(movie_title)|duplicated(movie_title,fromLast=TRUE)][i]]))==1) too.much.work1[i] <- TRUE else too.much.work1[i]  <- FALSE
if(length(unique(content_rating[movie_title==movie_title[duplicated(movie_title)|duplicated(movie_title,fromLast=TRUE)][i]]))==1) too.much.work2[i] <- TRUE else too.much.work2[i]  <- FALSE
if(length(unique(title_year[movie_title==movie_title[duplicated(movie_title)|duplicated(movie_title,fromLast=TRUE)][i]]))==1) too.much.work3[i] <- TRUE else too.much.work3[i]  <- FALSE
}

sum(too.much.work1)
## [1] 235
sum(too.much.work2)
## [1] 235
sum(too.much.work3)
## [1] 235
which(!too.much.work1==TRUE)
## [1]  54 149 180 182
which(!too.much.work2==TRUE)
## [1]  54 149 180 182
which(!too.much.work3==TRUE)
## [1]  54 149 180 182
movie_title[duplicated(movie_title)|duplicated(movie_title,fromLast=TRUE)][c(54,149,180,182)]
## [1] The Host<U+00A0>        The Host<U+00A0>        Out of the Blue<U+00A0>
## [4] Out of the Blue<U+00A0>
## 4818 Levels: #Horror<U+00A0> [Rec] 2<U+00A0> ... Zulu<U+00A0>
director_name[movie_title==movie_title[duplicated(movie_title)|duplicated(movie_title,fromLast=TRUE)][c(54,149,180,182)]] #different director name
## Warning in is.na(e1) | is.na(e2): longer object length is not a multiple of
## shorter object length
## Warning in `==.default`(movie_title, movie_title[duplicated(movie_title)
## | : longer object length is not a multiple of shorter object length
## [1] Andrew Niccol  Joon-ho Bong   Robert Sarkies
## 2370 Levels:  A. Raven Cruz Aaron Hann Aaron Schneider ... Zoran Lisinac
  • 영화 제목이 같았던 123개의 자료 중 2 개는 제목만 같고 다른 영화였음(상영시간이 충분히 다름)
dupli.title <- movie_title[duplicated(movie_title)|duplicated(movie_title,fromLast=TRUE)][-c(54,149,180,182)]
unique.dupli<-unique(dupli.title)
length(unique(dupli.title))
## [1] 114
remain.dupli<-vector(length=length(unique(dupli.title)))
for(i in 1:length(unique(dupli.title))){
remain.dupli[i]<-which(movie_title==unique.dupli[i])
}
row.dupli<-which(duplicated(movie_title)|duplicated(movie_title,fromLast=TRUE))[-remain.dupli]

data<-train[-(row.dupli),]
write.csv(data, "movie_data.csv")
attach(data)
## The following objects are masked from data (pos = 3):
## 
##     actor_1_facebook_likes, actor_1_name, actor_2_facebook_likes,
##     actor_2_name, actor_3_facebook_likes, actor_3_name,
##     aspect_ratio, budget, cast_total_facebook_likes, color,
##     content_rating, country, director_facebook_likes,
##     director_name, duration, facenumber_in_poster, genres, gross,
##     imdb_score, language, movie_facebook_likes, movie_imdb_link,
##     movie_title, num_critic_for_reviews, num_user_for_reviews,
##     num_voted_users, plot_keywords, title_year, X
  • row.dupli : 중복된 자료

B. NA 개수

sum(is.na(gross))
## [1] 821
sum(is.na(budget))
## [1] 461
sum(is.na(director_name)) #
## [1] 0

2. X Variables

A. 다중공선성 판단

attach(data)
## The following objects are masked from data (pos = 3):
## 
##     actor_1_facebook_likes, actor_1_name, actor_2_facebook_likes,
##     actor_2_name, actor_3_facebook_likes, actor_3_name,
##     aspect_ratio, budget, cast_total_facebook_likes, color,
##     content_rating, country, director_facebook_likes,
##     director_name, duration, facenumber_in_poster, genres, gross,
##     imdb_score, language, movie_facebook_likes, movie_imdb_link,
##     movie_title, num_critic_for_reviews, num_user_for_reviews,
##     num_voted_users, plot_keywords, title_year, X
## The following objects are masked from data (pos = 4):
## 
##     actor_1_facebook_likes, actor_1_name, actor_2_facebook_likes,
##     actor_2_name, actor_3_facebook_likes, actor_3_name,
##     aspect_ratio, budget, cast_total_facebook_likes, color,
##     content_rating, country, director_facebook_likes,
##     director_name, duration, facenumber_in_poster, genres, gross,
##     imdb_score, language, movie_facebook_likes, movie_imdb_link,
##     movie_title, num_critic_for_reviews, num_user_for_reviews,
##     num_voted_users, plot_keywords, title_year, X
numeric.col<-c(4,5,6,7,9,14,15,17,20,24,26,27,29)
library(corrplot)
## corrplot 0.84 loaded
corr<-cor(data[,numeric.col])
corrplot.mixed(corr, number.cex=0.8)

corr.Y <- cor(data[,"imdb_score"],data[,numeric.col[-12]])
rownames(corr.Y) <- c("imdb_score")
corr.Y
##            num_critic_for_reviews duration director_facebook_likes
## imdb_score                     NA       NA                      NA
##            actor_3_facebook_likes actor_1_facebook_likes num_voted_users
## imdb_score                     NA                     NA       0.4134778
##            cast_total_facebook_likes facenumber_in_poster
## imdb_score                0.08629341                   NA
##            num_user_for_reviews budget actor_2_facebook_likes
## imdb_score                   NA     NA                     NA
##            movie_facebook_likes
## imdb_score            0.2539803
library(car)
facebook.likes<-data[,c(27, 9, 26, 7, 6, 15)]
lm.fit.facebook<-lm(facebook.likes$imdb_score~., data=facebook.likes)
vif(lm.fit.facebook)
##    actor_1_facebook_likes    actor_2_facebook_likes 
##                247.807164                 19.208739 
##    actor_3_facebook_likes   director_facebook_likes 
##                  7.579070                  1.021186 
## cast_total_facebook_likes 
##                351.797720
  • 다중공선성이 의심되는 변수
    • Facebook likes : actor1(9), actor2(26), actor3(7), director(6), cast total(15)
    • Old movies : color(2), title_year(25), aspect_ratio(28)
    • review, rating : num_critic_for_reviews(4), num_voted_users(14), num_user_for_reviews(20)
old.movies<-data[,c("imdb_score", "color", "title_year","aspect_ratio")]
lm.fit.old<-lm(old.movies$imdb_score~., data=old.movies)
vif(lm.fit.old)
##                  GVIF Df GVIF^(1/(2*Df))
## color        1.106442  2        1.025610
## title_year   1.129253  1        1.062663
## aspect_ratio 1.025579  1        1.012709
  • Color, aspect_ratio가 범주형 자료라서 위의 결과가 나오는 것으로 추측(확실하지 않음). VIF가 크지는 않음
review.rating<-data[,c("imdb_score", "num_critic_for_reviews", "num_voted_users","num_user_for_reviews")]
lm.fit.rr<-lm(review.rating$imdb_score~., data=review.rating)
vif(lm.fit.rr)
## num_critic_for_reviews        num_voted_users   num_user_for_reviews 
##               1.710255               3.069690               2.962483
  • VIF가 2를 넘어가는 것으로 보아 R^2가 0.5를 넘어갔던 것으로 보임 -> num_voted_users와 num_user_for_reviews의 다중공선성이 크다
facebook.likes1<-data[,c(27,9,26,7,6)]
lm.fit.facebook1<-lm(facebook.likes1$imdb_score~., data=facebook.likes1)
vif(lm.fit.facebook1)
##  actor_1_facebook_likes  actor_2_facebook_likes  actor_3_facebook_likes 
##                1.173507                1.560694                1.424685 
## director_facebook_likes 
##                1.020927
  • [제안] cast_total_facebook_likes를 제거함에 따라 actor_1_facebook_likes의 VIF 도 감소함 -> 다중공선성 해결
review.rating1<-data[,c("imdb_score", "num_critic_for_reviews", "num_voted_users")]
lm.fit.rr1<-lm(review.rating1$imdb_score~., data=review.rating1)
vif(lm.fit.rr1)
## num_critic_for_reviews        num_voted_users 
##               1.625615               1.625615
  • VIF : cast_total_facebook_likes, num_user_for_reviews를 제거하면 다중 공선성을 해결할 수 있다 => 제거할 지 안할지는 더 논의가 필요ㅎ

B. X 변수 범주로 나누기 (Clustering)

0) 단위 통일
par(mfrow=c(1,2))
plot(budget)
plot(budget[!country=="USA"], col=2)

cor(imdb_score, log(budget))
## [1] NA
cor(imdb_score[country=="UK"],log(budget[country=="UK"]))
## [1] NA
cor(imdb_score[country=="France"],log(budget[country=="France"]))
## [1] NA
cor(imdb_score[country=="Canada"],log(budget[country=="Canada"]))
## [1] NA
cor(imdb_score[country=="Germany"],log(budget[country=="Germany"]))
## [1] NA
cor(imdb_score[country=="China"],log(budget[country=="China"]))
## [1] NA
  • Top 2 ~ 6 까지의 나라별 imdb_score과 log(budget) 사이의 상관관계를 조사해봤다
    • France는 0.02로 imdb_score과 log(budget) 사이의 상관관계인 -0.05보다도 상관관계가 줄어들었는데 Canada의 경우 상관관계가 더 커졌다 (-0.05 -> -0.3)
    • 나라별로 budget과 imdb_score 사이의 상관관계가 다를 수도 있음
  • Outliers; 화폐 단위가 다른 나라의 영화들이 budget outlier를 다차지함
  • [제안] : 사실, budget 과 imdb_score의 상관관계가 0.028로 크지 않다. 환율을 고려하는게 귀찮으면(솔직히 다른거 할거 많은데 이거 환율 일일이 찾을 시간이 어딨누) outlier그냥 지워버려도 되지만, 위의 경우 Canada 처럼 budget과 imdb_score의 상관관계가 나라마다 다르므로 budget을 나라별로 구별해서
1) 옛날 영화
plot(title_year, imdb_score)
abline(v=1979, col=2, lty=2)
abline(h=5, col=2, lty=2)

  • 1979년을 기준으로 그 이전의 영화들은 평점이 일정 수준 (5) 이상
2) 저예산 영화
usa.budget<-budget[country=="USA"]
plot(log(usa.budget), imdb_score[country=="USA"])
abline(v=14, col=2, lty=2)
abline(h=4, col=2, lty=2)

exp(14)
## [1] 1202604
  • [제안] 제작비 log(14)($1,202,604 ) 이하와 이상을 구별

C. X 변수 변환

1) Facebook likes
lm.fit.dir<-lm(imdb_score ~ director_facebook_likes)
lm.fit.tot<-lm(imdb_score ~ cast_total_facebook_likes)

par(mfrow=c(2,2))
plot(lm.fit.dir) #not bad

par(mfrow=c(2,2))
plot(lm.fit.tot)  #1791

bind<-cbind(data[,"actor_1_facebook_likes"],
            data[,"actor_2_facebook_likes"],
            data[,"actor_3_facebook_likes"]
)
facebook.likes.median <- apply(bind,1, median)
facebook.likes.mean <- apply(bind,1, mean)

lm.fit.median <- lm(imdb_score ~ facebook.likes.median)
lm.fit.mean <- lm(imdb_score ~ facebook.likes.mean)

par(mfrow=c(2,2))
plot(lm.fit.median) #1147

par(mfrow=c(2,2))
plot(lm.fit.mean)   #1791

  • log 변환하기 전 , 1147, 1791번째 outlier 때문에 정확한 잔차들의 그림을 볼 수 없다
  • 데이터를 확인한 결과 1147, 1791번째 자료가 유명한 배우라서 facebook likes가 정말 많은게 아니라, 그냥 그 두명만 facebook likes가 비정상적으로 많았다는 것을 확인할 수 있다. 따라서 이 두 outlier는 facebook likes의 median 값으로 대체한ㄷ
bind<-cbind(data[,"actor_1_facebook_likes"],
            data[,"actor_2_facebook_likes"],
            data[,"actor_3_facebook_likes"]
)
facebook.likes.median <- apply(bind,1, median)
facebook.likes.mean <- apply(bind,1, mean)

median.lm.fit<-lm(facebook.likes.median~cast_total_facebook_likes)
mean.lm.fit<-lm(facebook.likes.mean~cast_total_facebook_likes)

facebook.likes.mean[1791]<-coef(mean.lm.fit)[2]*train[1791,"cast_total_facebook_likes"]
facebook.likes.median[1147]<-median(facebook.likes.median)

lm.fit.median <- lm(imdb_score ~ facebook.likes.median)
lm.fit.mean <- lm(imdb_score ~ facebook.likes.mean)

par(mfrow=c(2,2))
plot(lm.fit.median) #1147

par(mfrow=c(2,2))
plot(lm.fit.mean)   #1791

data<-cbind(data, facebook.likes.median)

(cor(imdb_score, director_facebook_likes))
## [1] NA
(cor(imdb_score, actor_1_facebook_likes))
## [1] NA
(cor(imdb_score, actor_2_facebook_likes))
## [1] NA
(cor(imdb_score, actor_3_facebook_likes))
## [1] NA
(cor(imdb_score, facebook.likes.median))
## [1] NA
(cor(imdb_score, facebook.likes.mean))
## [1] NA

다 * 감독, 배우 1,2,3 facebook likes의 median을 취하는 새로운 변수 facebook.likes.median, 평균을 취한 facebook.likes.mean을 만들어봄 * outlier를 처리하고 잔차그림을 그려보니 잔차가 0을 중심으로 잘 분포되어 있ㅇ

bind<-cbind(log(data[,"actor_1_facebook_likes"]),
            log(data[,"actor_2_facebook_likes"]),
            log(data[,"actor_3_facebook_likes"])
)

(cor(imdb_score[-which(log(director_facebook_likes)==-Inf)], log(director_facebook_likes[-which(log(director_facebook_likes)==-Inf)])))
## [1] NA
(cor(imdb_score[-which(log(actor_1_facebook_likes)==-Inf)], log(actor_1_facebook_likes[-which(log(actor_1_facebook_likes)==-Inf)])))
## [1] NA
(cor(imdb_score[-which(log(actor_2_facebook_likes)==-Inf)], log(actor_2_facebook_likes[-which(log(actor_2_facebook_likes)==-Inf)])))
## [1] NA
(cor(imdb_score[-which(log(actor_3_facebook_likes)==-Inf)], log(actor_3_facebook_likes[-which(log(actor_3_facebook_likes)==-Inf)])))
## [1] NA
(cor(imdb_score[-which(facebook.likes.median==-Inf)], facebook.likes.median[-which(facebook.likes.median==-Inf)]))
## [1] NA
(cor(imdb_score[-which(facebook.likes.mean==-Inf)], facebook.likes.mean[-which(facebook.likes.mean==-Inf)]))
## [1] NA
  • log를 취한 director_facebook_likes의 Y와 상관관계가 0.34까지 증가
  • log 변환한 facebook Likes의 평균값을 취한 새로운 변수와 Y의 상관계수가 0.17 임
  • [제안] 나머지 facebook likes 변수들은 Y와의 상관계수가 너무 작으므로 log 변환한 director_facebook_likes만 사용하는 것이 바람직해보ㅇ