Assignment - SQL and R

Connet to DB in MySQL and stored matrixs

library("RODBC")
connect <-odbcConnect("Local instance MySQL57")
movies <- sqlQuery(connect, "SELECT * FROM movies.movieRecommend")
df <- as.data.frame(movies)
colnames(df)<- c("No","movId","titles","rate", "release_date","movType","languages","length","audId","audName","gender","age")
df
##    No movId                titles rate release_date        movType
## 1   1     1          Wonder_Woman    4   2017-06-02 ScienceFiction
## 2   2     2 Spider_Man_Homecoming    4   2017-07-07         Action
## 3   3     3                 Logan    3   2017-03-03         Action
## 4   4     4 The_Hitman's_Bodyguar    4   2017-08-18         Action
## 5   5     5       Despicable_Me_3    0   2017-06-04      Animation
## 6   6     6                    IT    3   2017-09-10         Horror
## 7   7     1          Wonder_Woman    4   2017-06-02 ScienceFiction
## 8   8     2 Spider_Man_Homecoming    4   2017-07-07         Action
## 9   9     3                 Logan    4   2017-03-03         Action
## 10 10     4 The_Hitman's_Bodyguar    3   2017-08-18         Action
## 11 11     5       Despicable_Me_3    0   2017-06-04      Animation
## 12 12     6                    IT    3   2017-09-10         Horror
## 13 13     1          Wonder_Woman    5   2017-06-02 ScienceFiction
## 14 14     2 Spider_Man_Homecoming    3   2017-07-07         Action
## 15 15     3                 Logan    3   2017-03-03         Action
## 16 16     4 The_Hitman's_Bodyguar    5   2017-08-18         Action
## 17 17     5       Despicable_Me_3    4   2017-06-04      Animation
## 18 18     6                    IT    0   2017-09-10         Horror
## 19 19     1          Wonder_Woman    4   2017-06-02 ScienceFiction
## 20 20     2 Spider_Man_Homecoming    5   2017-07-07         Action
## 21 21     3                 Logan    0   2017-03-03         Action
## 22 22     4 The_Hitman's_Bodyguar    5   2017-08-18         Action
## 23 23     5       Despicable_Me_3    5   2017-06-04      Animation
## 24 24     6                    IT    0   2017-09-10         Horror
## 25 25     1          Wonder_Woman    5   2017-06-02 ScienceFiction
## 26 26     2 Spider_Man_Homecoming    0   2017-07-07         Action
## 27 27     3                 Logan    5   2017-03-03         Action
## 28 28     4 The_Hitman's_Bodyguar    3   2017-08-18         Action
## 29 29     5       Despicable_Me_3    3   2017-06-04      Animation
## 30 30     6                    IT    4   2017-09-10         Horror
##    languages   length audId      audName gender age
## 1    English 02:21:00     1 PhillipSmith      M  25
## 2    English 02:13:00     1 PhillipSmith      M  25
## 3    English 01:59:00     1 PhillipSmith      M  25
## 4    English 01:58:00     1 PhillipSmith      M  25
## 5    English 01:36:00     1 PhillipSmith      M  25
## 6    English 02:15:00     1 PhillipSmith      M  25
## 7    English 02:21:00     2           JD      M  34
## 8    English 02:13:00     2           JD      M  34
## 9    English 01:59:00     2           JD      M  34
## 10   English 01:58:00     2           JD      M  34
## 11   English 01:36:00     2           JD      M  34
## 12   English 02:15:00     2           JD      M  34
## 13   English 02:21:00     3     JuneWong      F  28
## 14   English 02:13:00     3     JuneWong      F  28
## 15   English 01:59:00     3     JuneWong      F  28
## 16   English 01:58:00     3     JuneWong      F  28
## 17   English 01:36:00     3     JuneWong      F  28
## 18   English 02:15:00     3     JuneWong      F  28
## 19   English 02:21:00     4  SherryZhang      F  38
## 20   English 02:13:00     4  SherryZhang      F  38
## 21   English 01:59:00     4  SherryZhang      F  38
## 22   English 01:58:00     4  SherryZhang      F  38
## 23   English 01:36:00     4  SherryZhang      F  38
## 24   English 02:15:00     4  SherryZhang      F  38
## 25   English 02:21:00     5 ParwolKuzuba      M  22
## 26   English 02:13:00     5 ParwolKuzuba      M  22
## 27   English 01:59:00     5 ParwolKuzuba      M  22
## 28   English 01:58:00     5 ParwolKuzuba      M  22
## 29   English 01:36:00     5 ParwolKuzuba      M  22
## 30   English 02:15:00     5 ParwolKuzuba      M  22
summary(df)
##        No            movId                       titles       rate      
##  Min.   : 1.00   Min.   :1.0   Despicable_Me_3      :5   Min.   :0.000  
##  1st Qu.: 8.25   1st Qu.:2.0   IT                   :5   1st Qu.:3.000  
##  Median :15.50   Median :3.5   Logan                :5   Median :4.000  
##  Mean   :15.50   Mean   :3.5   Spider_Man_Homecoming:5   Mean   :3.167  
##  3rd Qu.:22.75   3rd Qu.:5.0   The_Hitman's_Bodyguar:5   3rd Qu.:4.000  
##  Max.   :30.00   Max.   :6.0   Wonder_Woman         :5   Max.   :5.000  
##   release_date                  movType     languages       length 
##  Min.   :2017-03-03   Action        :15   English:30   01:36:00:5  
##  1st Qu.:2017-06-02   Animation     : 5                01:58:00:5  
##  Median :2017-06-20   Horror        : 5                01:59:00:5  
##  Mean   :2017-06-22   ScienceFiction: 5                02:13:00:5  
##  3rd Qu.:2017-08-18                                    02:15:00:5  
##  Max.   :2017-09-10                                    02:21:00:5  
##      audId           audName  gender      age      
##  Min.   :1   JD          :6   F:12   Min.   :22.0  
##  1st Qu.:2   JuneWong    :6   M:18   1st Qu.:25.0  
##  Median :3   ParwolKuzuba:6          Median :28.0  
##  Mean   :3   PhillipSmith:6          Mean   :29.4  
##  3rd Qu.:4   SherryZhang :6          3rd Qu.:34.0  
##  Max.   :5                           Max.   :38.0

Check data types

str(df)
## 'data.frame':    30 obs. of  12 variables:
##  $ No          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ movId       : int  1 2 3 4 5 6 1 2 3 4 ...
##  $ titles      : Factor w/ 6 levels "Despicable_Me_3",..: 6 4 3 5 1 2 6 4 3 5 ...
##  $ rate        : int  4 4 3 4 0 3 4 4 4 3 ...
##  $ release_date: Date, format: "2017-06-02" "2017-07-07" ...
##  $ movType     : Factor w/ 4 levels "Action","Animation",..: 4 1 1 1 2 3 4 1 1 1 ...
##  $ languages   : Factor w/ 1 level "English": 1 1 1 1 1 1 1 1 1 1 ...
##  $ length      : Factor w/ 6 levels "01:36:00","01:58:00",..: 6 4 3 2 1 5 6 4 3 2 ...
##  $ audId       : int  1 1 1 1 1 1 2 2 2 2 ...
##  $ audName     : Factor w/ 5 levels "JD","JuneWong",..: 4 4 4 4 4 4 1 1 1 1 ...
##  $ gender      : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ age         : int  25 25 25 25 25 25 34 34 34 34 ...

Correct data types for colunms - No, movId, audId

df$No <- as.factor(df$No)
df$movId <- as.factor(df$movId)
df$audId <- as.factor(df$audId)

str(df)
## 'data.frame':    30 obs. of  12 variables:
##  $ No          : Factor w/ 30 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ movId       : Factor w/ 6 levels "1","2","3","4",..: 1 2 3 4 5 6 1 2 3 4 ...
##  $ titles      : Factor w/ 6 levels "Despicable_Me_3",..: 6 4 3 5 1 2 6 4 3 5 ...
##  $ rate        : int  4 4 3 4 0 3 4 4 4 3 ...
##  $ release_date: Date, format: "2017-06-02" "2017-07-07" ...
##  $ movType     : Factor w/ 4 levels "Action","Animation",..: 4 1 1 1 2 3 4 1 1 1 ...
##  $ languages   : Factor w/ 1 level "English": 1 1 1 1 1 1 1 1 1 1 ...
##  $ length      : Factor w/ 6 levels "01:36:00","01:58:00",..: 6 4 3 2 1 5 6 4 3 2 ...
##  $ audId       : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 2 2 2 2 ...
##  $ audName     : Factor w/ 5 levels "JD","JuneWong",..: 4 4 4 4 4 4 1 1 1 1 ...
##  $ gender      : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ age         : int  25 25 25 25 25 25 34 34 34 34 ...

Find missing values in Rate since all scores should be in range 1-5.

1 - no like at all
2 - few like
3 - may like
4 - some like
5 - very like
#required to install "sqldf" package
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
missing <- sqldf("select * from df where df.rate=0 ")

#required to install "htmlTable" package
library(htmlTable)
htmlTable(missing, caption = 'Summary of Missing Values by rate in the df dataset')
Summary of Missing Values by rate in the df dataset
No movId titles rate release_date movType languages length audId audName gender age
1 5 5 Despicable_Me_3 0 2017-06-04 Animation English 01:36:00 1 PhillipSmith M 25
2 11 5 Despicable_Me_3 0 2017-06-04 Animation English 01:36:00 2 JD M 34
3 18 6 IT 0 2017-09-10 Horror English 02:15:00 3 JuneWong F 28
4 21 3 Logan 0 2017-03-03 Action English 01:59:00 4 SherryZhang F 38
5 24 6 IT 0 2017-09-10 Horror English 02:15:00 4 SherryZhang F 38
6 26 2 Spider_Man_Homecoming 0 2017-07-07 Action English 02:13:00 5 ParwolKuzuba M 22

Modifying the missing data - rate = “0” -> NA

df$rate   <- ifelse(df$rate   == 0, NA, df$rate)

summary(df$rate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   3.000   3.000   4.000   3.958   5.000   5.000       6