library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.4
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.4
library(polycor)
## Warning: package 'polycor' was built under R version 3.4.4
library(mice)
## Warning: package 'mice' was built under R version 3.4.4
## Loading required package: lattice
library(reshape2)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(plotly)
## Warning: package 'plotly' was built under R version 3.4.4
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
df <- read.csv("diamonds-project.csv", header = TRUE, stringsAsFactors = F,
      colClasses = c("numeric","character","character","character","numeric","numeric","numeric",
                     "numeric","numeric","numeric"))
print(paste("Total no. of rows are",nrow(df)))
## [1] "Total no. of rows are 53940"
print(paste("Total no. of columns are",ncol(df)))
## [1] "Total no. of columns are 10"
head(df)
##   carat       cut color clarity depth table price    x    y    z
## 1  0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
## 2  0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
## 3  0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
## 4  0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
## 5  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48
str(df)
## 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : chr  "Ideal" "Premium" "Good" "Premium" ...
##  $ color  : chr  "E" "E" "E" "I" ...
##  $ clarity: chr  "SI2" "SI1" "VS1" "VS2" ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : num  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
summary(df)
##      carat              cut               color          
##  Min.   :  0.2000   Length:53940       Length:53940      
##  1st Qu.:  0.4000   Class :character   Class :character  
##  Median :  0.7000   Mode  :character   Mode  :character  
##  Mean   :  0.8164                                        
##  3rd Qu.:  1.0400                                        
##  Max.   :999.0000                                        
##  NA's   :1                                               
##    clarity              depth            table            price      
##  Length:53940       Min.   : 43.00   Min.   : 43.00   Min.   :    0  
##  Class :character   1st Qu.: 61.00   1st Qu.: 56.00   1st Qu.:  950  
##  Mode  :character   Median : 61.80   Median : 57.00   Median : 2401  
##                     Mean   : 61.77   Mean   : 57.48   Mean   : 3935  
##                     3rd Qu.: 62.50   3rd Qu.: 59.00   3rd Qu.: 5325  
##                     Max.   :999.00   Max.   :999.00   Max.   :99999  
##                     NA's   :1        NA's   :1        NA's   :1      
##        x                y                z         
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 4.710   1st Qu.: 4.720   1st Qu.: 2.910  
##  Median : 5.700   Median : 5.710   Median : 3.530  
##  Mean   : 5.731   Mean   : 5.735   Mean   : 3.539  
##  3rd Qu.: 6.540   3rd Qu.: 6.540   3rd Qu.: 4.040  
##  Max.   :10.740   Max.   :58.900   Max.   :31.800  
## 

check for NAs

# get isNA Count
detectNAs <- function(inp) {
    return(sum(is.na(inp)))
}

# detect NAs
lapply(df, FUN=detectNAs)
## $carat
## [1] 1
## 
## $cut
## [1] 0
## 
## $color
## [1] 0
## 
## $clarity
## [1] 0
## 
## $depth
## [1] 1
## 
## $table
## [1] 1
## 
## $price
## [1] 1
## 
## $x
## [1] 0
## 
## $y
## [1] 0
## 
## $z
## [1] 0
# get detectZeros Count
detectZeros <- function(inp) {
    if (class(inp) != "numeric") {
        return ("Non Numeric Column")
    }
    sum(inp==0)
}
# detect Spaces
lapply(df, FUN=detectZeros)
## $carat
## [1] NA
## 
## $cut
## [1] "Non Numeric Column"
## 
## $color
## [1] "Non Numeric Column"
## 
## $clarity
## [1] "Non Numeric Column"
## 
## $depth
## [1] NA
## 
## $table
## [1] NA
## 
## $price
## [1] NA
## 
## $x
## [1] 8
## 
## $y
## [1] 7
## 
## $z
## [1] 20
### There are zeros in X, Y and Z columns. Here we will convert them into NAs
df$x[df$x == 0] <- NA
df$y[df$y == 0] <- NA
df$z[df$z == 0] <- NA
df$price[df$price == 0] <-NA 
## check for zeros again
lapply(df, detectZeros)
## $carat
## [1] NA
## 
## $cut
## [1] "Non Numeric Column"
## 
## $color
## [1] "Non Numeric Column"
## 
## $clarity
## [1] "Non Numeric Column"
## 
## $depth
## [1] NA
## 
## $table
## [1] NA
## 
## $price
## [1] NA
## 
## $x
## [1] NA
## 
## $y
## [1] NA
## 
## $z
## [1] NA

** check for undefined data in categoric columns**

# get detectSpaces Count
detectSpaces <- function(inp) {
    if (class(inp) != "character") {
        return ("Non Character Column")
    }
    sum(trimws(inp)=="")
}

# detect spaces
lapply(df, FUN=detectSpaces)
## $carat
## [1] "Non Character Column"
## 
## $cut
## [1] 1
## 
## $color
## [1] 1
## 
## $clarity
## [1] 1
## 
## $depth
## [1] "Non Character Column"
## 
## $table
## [1] "Non Character Column"
## 
## $price
## [1] "Non Character Column"
## 
## $x
## [1] "Non Character Column"
## 
## $y
## [1] "Non Character Column"
## 
## $z
## [1] "Non Character Column"
df$cut[trimws(df$cut) == ""] <- NA
df$color[trimws(df$color) == ""] <- NA
df$clarity[trimws(df$clarity) == ""] <- NA

## check again for spaces
lapply(df,detectSpaces)
## $carat
## [1] "Non Character Column"
## 
## $cut
## [1] NA
## 
## $color
## [1] NA
## 
## $clarity
## [1] NA
## 
## $depth
## [1] "Non Character Column"
## 
## $table
## [1] "Non Character Column"
## 
## $price
## [1] "Non Character Column"
## 
## $x
## [1] "Non Character Column"
## 
## $y
## [1] "Non Character Column"
## 
## $z
## [1] "Non Character Column"
# get Outlier Count
detectOutliers <- function(inp, na.rm=TRUE) {
    if (class(inp) != "numeric") {
        return ("Non Numeric Column")
    }
    i.qnt <- quantile(inp, probs=c(.25, .75), na.rm=na.rm)
    i.max <- 1.5 * IQR(inp, na.rm=na.rm)
    otp <- inp
    otp[inp < (i.qnt[1] - i.max)] <- NA
    otp[inp > (i.qnt[2] + i.max)] <- NA
    return(inp[is.na(otp)])
}

# detect Outliers
outliers <- lapply(df, FUN=detectOutliers)

# get outlier count
summary(outliers)
##         Length Class  Mode     
## carat   1891   -none- numeric  
## cut        1   -none- character
## color      1   -none- character
## clarity    1   -none- character
## depth   2547   -none- numeric  
## table    608   -none- numeric  
## price   3541   -none- numeric  
## x         32   -none- numeric  
## y         29   -none- numeric  
## z         49   -none- numeric
### "carat" and "depth" columns have huge number of outliers. we can either delete those rows or 
### we can increase our range of outliers. In our case we will increase ouroutlier range to save 
### the data rows and replace the outliers with NAs
ReplaceOutliers <- function(inp, na.rm=TRUE) {
    if (class(inp) != "numeric") {
        return (inp)
    }
    i.qnt <- quantile(inp, probs=c(.25, .75), na.rm=na.rm)
    i.max <- 3 * IQR(inp, na.rm=na.rm)
    inp[inp < (i.qnt[1] - i.max)] <- NA
    inp[inp > (i.qnt[2] + i.max)] <- NA
    return(inp)
}

# detect Outliers
outliers <- sapply(df, FUN=ReplaceOutliers)
summary(outliers)
##      carat              cut            color          clarity     
##  0.3    : 2604   Fair     : 1610   G      :11292   SI1    :13065  
##  0.31   : 2249   Good     : 4905   E      : 9796   VS2    :12258  
##  1.01   : 2242   Ideal    :21550   F      : 9542   SI2    : 9194  
##  0.7    : 1981   Premium  :13791   H      : 8304   VS1    : 8169  
##  0.32   : 1840   Very Good:12082   D      : 6774   VVS2   : 5066  
##  (Other):42982   Wonderful:    1   (Other): 8231   (Other): 6187  
##  NA's   :   42   NA's     :    1   NA's   :    1   NA's   :    1  
##      depth           table           price             x        
##  62     : 2239   56     :10039   605    :  132   4.37   :  448  
##  61.9   : 2163   57     : 9811   802    :  127   4.34   :  437  
##  61.8   : 2077   58     : 8435   625    :  126   4.33   :  429  
##  62.2   : 2039   59     : 6621   828    :  125   4.38   :  428  
##  62.1   : 2020   55     : 6442   776    :  124   4.32   :  425  
##  (Other):43122   (Other):12562   (Other):53183   (Other):51765  
##  NA's   :  280   NA's   :   30   NA's   :  123   NA's   :    8  
##        y               z        
##  4.34   :  437   2.7    :  767  
##  4.37   :  435   2.69   :  748  
##  4.35   :  425   2.71   :  738  
##  4.33   :  421   2.68   :  730  
##  4.32   :  414   2.72   :  697  
##  (Other):51799   (Other):50238  
##  NA's   :    9   NA's   :   22
# get new outlier count
df <- data.frame(outliers,stringsAsFactors = F)
lapply(df,detectNAs)
## $carat
## [1] 42
## 
## $cut
## [1] 1
## 
## $color
## [1] 1
## 
## $clarity
## [1] 1
## 
## $depth
## [1] 280
## 
## $table
## [1] 30
## 
## $price
## [1] 123
## 
## $x
## [1] 8
## 
## $y
## [1] 9
## 
## $z
## [1] 22
write.csv(x = df,file = "df.csv",row.names = FALSE)

Let’s Impute values for NAs by KNN

df <- read.csv("df.csv",stringsAsFactors = F)

#here we need to convert catrgorical variables into factors
df$cut <- as.factor(df$cut)
levels(df$cut) <-c(1,2,3,4,5,6)

df$color <- as.factor(df$color)
levels(df$color) <- c(11,12,13,14,15,16,17,18)

df$clarity <- as.factor(df$clarity)
levels(df$clarity) <- c(21,22,23,24,25,26,27,28,29)

summary(df)
##      carat          cut            color          clarity     
##  Min.   :0.2000   1   : 1610   15     :11292   23     :13065  
##  1st Qu.:0.4000   2   : 4905   13     : 9796   26     :12258  
##  Median :0.7000   3   :21550   14     : 9542   24     : 9194  
##  Mean   :0.7961   4   :13791   16     : 8304   25     : 8169  
##  3rd Qu.:1.0400   5   :12082   12     : 6774   28     : 5066  
##  Max.   :2.8000   6   :    1   (Other): 8231   (Other): 6187  
##  NA's   :42       NA's:    1   NA's   :    1   NA's   :    1  
##      depth           table           price             x         
##  Min.   :56.50   Min.   :49.00   Min.   :  326   Min.   : 3.730  
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  948   1st Qu.: 4.710  
##  Median :61.80   Median :57.00   Median : 2396   Median : 5.700  
##  Mean   :61.74   Mean   :57.45   Mean   : 3900   Mean   : 5.732  
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5294   3rd Qu.: 6.540  
##  Max.   :67.00   Max.   :68.00   Max.   :18447   Max.   :10.740  
##  NA's   :280     NA's   :30      NA's   :123     NA's   :8       
##        y                z        
##  Min.   : 3.680   Min.   :1.070  
##  1st Qu.: 4.720   1st Qu.:2.910  
##  Median : 5.710   Median :3.530  
##  Mean   : 5.734   Mean   :3.539  
##  3rd Qu.: 6.540   3rd Qu.:4.040  
##  Max.   :10.540   Max.   :6.980  
##  NA's   :9        NA's   :22
df <- filter(df,!(is.na(df$cut)))
df <- filter(df,!(is.na(df$color)))
df <- filter(df,!(is.na(df$clarity)))
df <- filter(df,!(is.na(df$table)))
df <- filter(df,!(is.na(df$x)))
df <- filter(df,!(is.na(df$y)))
df <- filter(df,!(is.na(df$z)))
summary(df)
##      carat        cut           color          clarity     
##  Min.   :0.2000   1: 1582   15     :11281   23     :13056  
##  1st Qu.:0.4000   2: 4901   13     : 9788   26     :12243  
##  Median :0.7000   3:21545   14     : 9531   24     : 9178  
##  Mean   :0.7958   4:13777   16     : 8289   25     : 8158  
##  3rd Qu.:1.0400   5:12078   12     : 6770   28     : 5066  
##  Max.   :2.8000   6:    1   17     : 5417   27     : 3654  
##  NA's   :42                 (Other): 2808   (Other): 2529  
##      depth           table           price             x         
##  Min.   :56.50   Min.   :49.00   Min.   :  326   Min.   : 3.730  
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  947   1st Qu.: 4.710  
##  Median :61.80   Median :57.00   Median : 2396   Median : 5.700  
##  Mean   :61.74   Mean   :57.45   Mean   : 3898   Mean   : 5.731  
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5294   3rd Qu.: 6.540  
##  Max.   :67.00   Max.   :68.00   Max.   :18447   Max.   :10.740  
##  NA's   :271                     NA's   :122                     
##        y                z        
##  Min.   : 3.680   Min.   :1.070  
##  1st Qu.: 4.720   1st Qu.:2.910  
##  Median : 5.710   Median :3.530  
##  Mean   : 5.733   Mean   :3.539  
##  3rd Qu.: 6.540   3rd Qu.:4.040  
##  Max.   :10.540   Max.   :6.980  
## 

machine learning 1

stepModel <- step(lm(data = df, price~carat+cut+color+clarity), trace = 0, steps = 100)
summary(stepModel)
## 
## Call:
## lm(formula = price ~ carat + cut + color + clarity, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8730.8  -673.8  -190.8   467.6  9847.9 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7332.21    1118.22  -6.557 5.54e-11 ***
## carat        8899.42      11.83 752.266  < 2e-16 ***
## cut2          640.05      32.84  19.490  < 2e-16 ***
## cut3          981.36      29.98  32.731  < 2e-16 ***
## cut4          853.61      30.25  28.216  < 2e-16 ***
## cut5          829.53      30.58  27.130  < 2e-16 ***
## cut6         -127.65    1117.54  -0.114  0.90906    
## color12       295.32    1117.23   0.264  0.79152    
## color13        83.64    1117.21   0.075  0.94032    
## color14       -20.69    1117.21  -0.019  0.98523    
## color15      -221.98    1117.21  -0.199  0.84250    
## color16      -689.62    1117.22  -0.617  0.53706    
## color17     -1136.88    1117.27  -1.018  0.30889    
## color18     -2016.99    1117.37  -1.805  0.07106 .  
## clarity22    5091.98      50.90 100.032  < 2e-16 ***
## clarity23    3244.99      43.75  74.169  < 2e-16 ***
## clarity24    2301.07      43.98  52.326  < 2e-16 ***
## clarity25    4216.25      44.62  94.487  < 2e-16 ***
## clarity26    3894.95      43.97  88.577  < 2e-16 ***
## clarity27    4756.23      47.14 100.901  < 2e-16 ***
## clarity28    4650.06      45.89 101.328  < 2e-16 ***
## clarity29    3170.97    1118.02   2.836  0.00457 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1117 on 53704 degrees of freedom
##   (158 observations deleted due to missingness)
## Multiple R-squared:  0.919,  Adjusted R-squared:  0.9189 
## F-statistic: 2.9e+04 on 21 and 53704 DF,  p-value: < 2.2e-16

observation From the above observation we can see that Adj. r-squared is high and F-statistic is also higher than p-value that means price is significantly dependent on “Carat”, “Cut”, “Color” and “Clarity” variables.

## So we can now impute NAs in price variable on forementioned variables.
library(VIM)
## Warning: package 'VIM' was built under R version 3.4.4
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 3.4.4
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
df1 <- kNN(df,variable = c("price"),dist_var = c("carat", "cut", "color","clarity"))
summary(df1)
##      carat        cut           color          clarity     
##  Min.   :0.2000   1: 1582   15     :11281   23     :13056  
##  1st Qu.:0.4000   2: 4901   13     : 9788   26     :12243  
##  Median :0.7000   3:21545   14     : 9531   24     : 9178  
##  Mean   :0.7958   4:13777   16     : 8289   25     : 8158  
##  3rd Qu.:1.0400   5:12078   12     : 6770   28     : 5066  
##  Max.   :2.8000   6:    1   17     : 5417   27     : 3654  
##  NA's   :42                 (Other): 2808   (Other): 2529  
##      depth           table           price             x         
##  Min.   :56.50   Min.   :49.00   Min.   :  326   Min.   : 3.730  
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  949   1st Qu.: 4.710  
##  Median :61.80   Median :57.00   Median : 2400   Median : 5.700  
##  Mean   :61.74   Mean   :57.45   Mean   : 3919   Mean   : 5.731  
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5323   3rd Qu.: 6.540  
##  Max.   :67.00   Max.   :68.00   Max.   :18447   Max.   :10.740  
##  NA's   :271                                                     
##        y                z         price_imp      
##  Min.   : 3.680   Min.   :1.070   Mode :logical  
##  1st Qu.: 4.720   1st Qu.:2.910   FALSE:53762    
##  Median : 5.710   Median :3.530   TRUE :122      
##  Mean   : 5.733   Mean   :3.539                  
##  3rd Qu.: 6.540   3rd Qu.:4.040                  
##  Max.   :10.540   Max.   :6.980                  
## 
df <- dplyr::select(df1, carat:z)
summary(df)
##      carat        cut           color          clarity     
##  Min.   :0.2000   1: 1582   15     :11281   23     :13056  
##  1st Qu.:0.4000   2: 4901   13     : 9788   26     :12243  
##  Median :0.7000   3:21545   14     : 9531   24     : 9178  
##  Mean   :0.7958   4:13777   16     : 8289   25     : 8158  
##  3rd Qu.:1.0400   5:12078   12     : 6770   28     : 5066  
##  Max.   :2.8000   6:    1   17     : 5417   27     : 3654  
##  NA's   :42                 (Other): 2808   (Other): 2529  
##      depth           table           price             x         
##  Min.   :56.50   Min.   :49.00   Min.   :  326   Min.   : 3.730  
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  949   1st Qu.: 4.710  
##  Median :61.80   Median :57.00   Median : 2400   Median : 5.700  
##  Mean   :61.74   Mean   :57.45   Mean   : 3919   Mean   : 5.731  
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5323   3rd Qu.: 6.540  
##  Max.   :67.00   Max.   :68.00   Max.   :18447   Max.   :10.740  
##  NA's   :271                                                     
##        y                z        
##  Min.   : 3.680   Min.   :1.070  
##  1st Qu.: 4.720   1st Qu.:2.910  
##  Median : 5.710   Median :3.530  
##  Mean   : 5.733   Mean   :3.539  
##  3rd Qu.: 6.540   3rd Qu.:4.040  
##  Max.   :10.540   Max.   :6.980  
## 

Impute NAs for other variables

df1 <- kNN(df,variable = c("carat","depth"))
summary(df1)
##      carat        cut           color          clarity     
##  Min.   :0.2000   1: 1582   15     :11281   23     :13056  
##  1st Qu.:0.4000   2: 4901   13     : 9788   26     :12243  
##  Median :0.7000   3:21545   14     : 9531   24     : 9178  
##  Mean   :0.7967   4:13777   16     : 8289   25     : 8158  
##  3rd Qu.:1.0400   5:12078   12     : 6770   28     : 5066  
##  Max.   :2.8000   6:    1   17     : 5417   27     : 3654  
##                             (Other): 2808   (Other): 2529  
##      depth           table           price             x         
##  Min.   :56.50   Min.   :49.00   Min.   :  326   Min.   : 3.730  
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  949   1st Qu.: 4.710  
##  Median :61.80   Median :57.00   Median : 2400   Median : 5.700  
##  Mean   :61.74   Mean   :57.45   Mean   : 3919   Mean   : 5.731  
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5323   3rd Qu.: 6.540  
##  Max.   :67.00   Max.   :68.00   Max.   :18447   Max.   :10.740  
##                                                                  
##        y                z         carat_imp       depth_imp      
##  Min.   : 3.680   Min.   :1.070   Mode :logical   Mode :logical  
##  1st Qu.: 4.720   1st Qu.:2.910   FALSE:53842     FALSE:53613    
##  Median : 5.710   Median :3.530   TRUE :42        TRUE :271      
##  Mean   : 5.733   Mean   :3.539                                  
##  3rd Qu.: 6.540   3rd Qu.:4.040                                  
##  Max.   :10.540   Max.   :6.980                                  
## 
df <- dplyr::select(df1,-c("carat_imp","depth_imp"))

Machine Learning 2

cut.model <- step(glm(data = df, formula = factor(cut)~price + carat + color+ clarity,
                      family = binomial(link = "logit")), trace = 0,steps = 100)
summary(cut.model)
## 
## Call:
## glm(formula = factor(cut) ~ price + carat + color + clarity, 
##     family = binomial(link = "logit"), data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.9914   0.1497   0.1980   0.2570   2.8243  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.359e+01  1.970e+02   0.069 0.944985    
## price        6.425e-04  3.085e-05  20.828  < 2e-16 ***
## carat       -5.465e+00  2.334e-01 -23.422  < 2e-16 ***
## color12     -8.398e+00  1.970e+02  -0.043 0.965992    
## color13     -8.260e+00  1.970e+02  -0.042 0.966552    
## color14     -8.523e+00  1.970e+02  -0.043 0.965485    
## color15     -8.323e+00  1.970e+02  -0.042 0.966294    
## color16     -8.123e+00  1.970e+02  -0.041 0.967103    
## color17     -7.781e+00  1.970e+02  -0.040 0.968489    
## color18     -7.476e+00  1.970e+02  -0.038 0.969723    
## clarity22    1.153e+00  3.724e-01   3.097 0.001956 ** 
## clarity23    3.317e-01  1.407e-01   2.358 0.018375 *  
## clarity24    4.588e-01  1.247e-01   3.679 0.000235 ***
## clarity25    2.049e-01  1.683e-01   1.217 0.223466    
## clarity26    3.460e-01  1.564e-01   2.213 0.026921 *  
## clarity27    1.328e+00  2.911e-01   4.563 5.05e-06 ***
## clarity28    3.231e-01  1.976e-01   1.635 0.102134    
## clarity29    8.404e+00  1.970e+02   0.043 0.965968    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14280  on 53883  degrees of freedom
## Residual deviance: 12498  on 53866  degrees of freedom
## AIC: 12534
## 
## Number of Fisher Scoring iterations: 10

observations From above mentioned Null and residual dveiances we can state that “cut” is not significantly dependent only on aforementioned variables as adding them has reduced “residual deviance” not so significantly.

Exploratory Data Analysis

# Average price with carat & cut

Avg.Price1 <- group_by(df,carat,cut)
Avg.Price1 <- arrange(select(Avg.Price1,carat,cut,price),carat)
Avg.Price1 <- summarise(Avg.Price1,Average.price = mean(price))
head(Avg.Price1)
## # A tibble: 6 x 3
## # Groups:   carat [3]
##   carat cut   Average.price
##   <dbl> <fct>         <dbl>
## 1  0.2  3              367 
## 2  0.2  4              364.
## 3  0.2  5              367 
## 4  0.21 4              380.
## 5  0.21 5              386 
## 6  0.22 1              337
# Average price with carat & clarity

Avg.Price2 <- group_by(df,carat,clarity)
Avg.Price2 <- arrange(select(Avg.Price2,carat,clarity,price),carat)
Avg.Price2 <- summarise(Avg.Price2,Average.price = mean(price))
head(Avg.Price2)
## # A tibble: 6 x 3
## # Groups:   carat [3]
##   carat clarity Average.price
##   <dbl> <fct>           <dbl>
## 1  0.2  24                345
## 2  0.2  26                367
## 3  0.21 23                326
## 4  0.21 24                394
## 5  0.21 26                386
## 6  0.22 23                406
# Average price with carat & color
Avg.Price3 <- group_by(df,carat,color)
Avg.Price3 <- arrange(select(Avg.Price3,carat,color,price),carat)
Avg.Price3 <- summarise(Avg.Price3,Average.price = mean(price))
head(Avg.Price3)
## # A tibble: 6 x 3
## # Groups:   carat [3]
##   carat color Average.price
##   <dbl> <fct>         <dbl>
## 1  0.2  12             367 
## 2  0.2  13             364.
## 3  0.2  14             367 
## 4  0.21 12             386 
## 5  0.21 13             376.
## 6  0.22 12             404

Visual Data Analysis

ggplot.price <- ggplot(df, aes(x = price))+
                geom_histogram(bins = 100)+
                labs(title = "Distribution of Price")
ggplotly(ggplot.price)
ggplot.cut <- ggplot(df, aes(x = cut))+
              geom_histogram(stat = 'count',fill='red',binwidth = 2)+
              labs(title = 'Distribution of Cut')
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(ggplot.cut)

observation #In overall distribution of data, “Ideal” type of cut has highest count of 21545 followed by “Premium” and “Very Good”,each with count of 13777 and 12078 respectively.

ggplot.carat.cut <- ggplot(df,aes(x = cut, y = carat))+
                    geom_jitter(aes(colour = cut))+
                    labs(title = "Distribution of carat in each cut",x = "cut",y = "carat")
ggplot.carat.cut

ggplot.depth.price<-ggplot(df,aes(x = carat,y = price))+
                    geom_point()+stat_smooth(method = 'lm',position = 'identity')+
                    labs(title = "Price vs carat", x = "carat",y = "price",
                    subtitle = paste("Correlation between carat and price",cor(df$carat,df$price)))

ggplot.depth.price