library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.4
library(polycor)
## Warning: package 'polycor' was built under R version 3.4.4
library(mice)
## Warning: package 'mice' was built under R version 3.4.4
## Loading required package: lattice
library(reshape2)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(plotly)
## Warning: package 'plotly' was built under R version 3.4.4
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
df <- read.csv("diamonds-project.csv", header = TRUE, stringsAsFactors = F,
colClasses = c("numeric","character","character","character","numeric","numeric","numeric",
"numeric","numeric","numeric"))
print(paste("Total no. of rows are",nrow(df)))
## [1] "Total no. of rows are 53940"
print(paste("Total no. of columns are",ncol(df)))
## [1] "Total no. of columns are 10"
head(df)
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
str(df)
## 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : chr "Ideal" "Premium" "Good" "Premium" ...
## $ color : chr "E" "E" "E" "I" ...
## $ clarity: chr "SI2" "SI1" "VS1" "VS2" ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : num 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
summary(df)
## carat cut color
## Min. : 0.2000 Length:53940 Length:53940
## 1st Qu.: 0.4000 Class :character Class :character
## Median : 0.7000 Mode :character Mode :character
## Mean : 0.8164
## 3rd Qu.: 1.0400
## Max. :999.0000
## NA's :1
## clarity depth table price
## Length:53940 Min. : 43.00 Min. : 43.00 Min. : 0
## Class :character 1st Qu.: 61.00 1st Qu.: 56.00 1st Qu.: 950
## Mode :character Median : 61.80 Median : 57.00 Median : 2401
## Mean : 61.77 Mean : 57.48 Mean : 3935
## 3rd Qu.: 62.50 3rd Qu.: 59.00 3rd Qu.: 5325
## Max. :999.00 Max. :999.00 Max. :99999
## NA's :1 NA's :1 NA's :1
## x y z
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.710 1st Qu.: 4.720 1st Qu.: 2.910
## Median : 5.700 Median : 5.710 Median : 3.530
## Mean : 5.731 Mean : 5.735 Mean : 3.539
## 3rd Qu.: 6.540 3rd Qu.: 6.540 3rd Qu.: 4.040
## Max. :10.740 Max. :58.900 Max. :31.800
##
check for NAs
# get isNA Count
detectNAs <- function(inp) {
return(sum(is.na(inp)))
}
# detect NAs
lapply(df, FUN=detectNAs)
## $carat
## [1] 1
##
## $cut
## [1] 0
##
## $color
## [1] 0
##
## $clarity
## [1] 0
##
## $depth
## [1] 1
##
## $table
## [1] 1
##
## $price
## [1] 1
##
## $x
## [1] 0
##
## $y
## [1] 0
##
## $z
## [1] 0
# get detectZeros Count
detectZeros <- function(inp) {
if (class(inp) != "numeric") {
return ("Non Numeric Column")
}
sum(inp==0)
}
# detect Spaces
lapply(df, FUN=detectZeros)
## $carat
## [1] NA
##
## $cut
## [1] "Non Numeric Column"
##
## $color
## [1] "Non Numeric Column"
##
## $clarity
## [1] "Non Numeric Column"
##
## $depth
## [1] NA
##
## $table
## [1] NA
##
## $price
## [1] NA
##
## $x
## [1] 8
##
## $y
## [1] 7
##
## $z
## [1] 20
### There are zeros in X, Y and Z columns. Here we will convert them into NAs
df$x[df$x == 0] <- NA
df$y[df$y == 0] <- NA
df$z[df$z == 0] <- NA
df$price[df$price == 0] <-NA
## check for zeros again
lapply(df, detectZeros)
## $carat
## [1] NA
##
## $cut
## [1] "Non Numeric Column"
##
## $color
## [1] "Non Numeric Column"
##
## $clarity
## [1] "Non Numeric Column"
##
## $depth
## [1] NA
##
## $table
## [1] NA
##
## $price
## [1] NA
##
## $x
## [1] NA
##
## $y
## [1] NA
##
## $z
## [1] NA
** check for undefined data in categoric columns**
# get detectSpaces Count
detectSpaces <- function(inp) {
if (class(inp) != "character") {
return ("Non Character Column")
}
sum(trimws(inp)=="")
}
# detect spaces
lapply(df, FUN=detectSpaces)
## $carat
## [1] "Non Character Column"
##
## $cut
## [1] 1
##
## $color
## [1] 1
##
## $clarity
## [1] 1
##
## $depth
## [1] "Non Character Column"
##
## $table
## [1] "Non Character Column"
##
## $price
## [1] "Non Character Column"
##
## $x
## [1] "Non Character Column"
##
## $y
## [1] "Non Character Column"
##
## $z
## [1] "Non Character Column"
df$cut[trimws(df$cut) == ""] <- NA
df$color[trimws(df$color) == ""] <- NA
df$clarity[trimws(df$clarity) == ""] <- NA
## check again for spaces
lapply(df,detectSpaces)
## $carat
## [1] "Non Character Column"
##
## $cut
## [1] NA
##
## $color
## [1] NA
##
## $clarity
## [1] NA
##
## $depth
## [1] "Non Character Column"
##
## $table
## [1] "Non Character Column"
##
## $price
## [1] "Non Character Column"
##
## $x
## [1] "Non Character Column"
##
## $y
## [1] "Non Character Column"
##
## $z
## [1] "Non Character Column"
# get Outlier Count
detectOutliers <- function(inp, na.rm=TRUE) {
if (class(inp) != "numeric") {
return ("Non Numeric Column")
}
i.qnt <- quantile(inp, probs=c(.25, .75), na.rm=na.rm)
i.max <- 1.5 * IQR(inp, na.rm=na.rm)
otp <- inp
otp[inp < (i.qnt[1] - i.max)] <- NA
otp[inp > (i.qnt[2] + i.max)] <- NA
return(inp[is.na(otp)])
}
# detect Outliers
outliers <- lapply(df, FUN=detectOutliers)
# get outlier count
summary(outliers)
## Length Class Mode
## carat 1891 -none- numeric
## cut 1 -none- character
## color 1 -none- character
## clarity 1 -none- character
## depth 2547 -none- numeric
## table 608 -none- numeric
## price 3541 -none- numeric
## x 32 -none- numeric
## y 29 -none- numeric
## z 49 -none- numeric
### "carat" and "depth" columns have huge number of outliers. we can either delete those rows or
### we can increase our range of outliers. In our case we will increase ouroutlier range to save
### the data rows and replace the outliers with NAs
ReplaceOutliers <- function(inp, na.rm=TRUE) {
if (class(inp) != "numeric") {
return (inp)
}
i.qnt <- quantile(inp, probs=c(.25, .75), na.rm=na.rm)
i.max <- 3 * IQR(inp, na.rm=na.rm)
inp[inp < (i.qnt[1] - i.max)] <- NA
inp[inp > (i.qnt[2] + i.max)] <- NA
return(inp)
}
# detect Outliers
outliers <- sapply(df, FUN=ReplaceOutliers)
summary(outliers)
## carat cut color clarity
## 0.3 : 2604 Fair : 1610 G :11292 SI1 :13065
## 0.31 : 2249 Good : 4905 E : 9796 VS2 :12258
## 1.01 : 2242 Ideal :21550 F : 9542 SI2 : 9194
## 0.7 : 1981 Premium :13791 H : 8304 VS1 : 8169
## 0.32 : 1840 Very Good:12082 D : 6774 VVS2 : 5066
## (Other):42982 Wonderful: 1 (Other): 8231 (Other): 6187
## NA's : 42 NA's : 1 NA's : 1 NA's : 1
## depth table price x
## 62 : 2239 56 :10039 605 : 132 4.37 : 448
## 61.9 : 2163 57 : 9811 802 : 127 4.34 : 437
## 61.8 : 2077 58 : 8435 625 : 126 4.33 : 429
## 62.2 : 2039 59 : 6621 828 : 125 4.38 : 428
## 62.1 : 2020 55 : 6442 776 : 124 4.32 : 425
## (Other):43122 (Other):12562 (Other):53183 (Other):51765
## NA's : 280 NA's : 30 NA's : 123 NA's : 8
## y z
## 4.34 : 437 2.7 : 767
## 4.37 : 435 2.69 : 748
## 4.35 : 425 2.71 : 738
## 4.33 : 421 2.68 : 730
## 4.32 : 414 2.72 : 697
## (Other):51799 (Other):50238
## NA's : 9 NA's : 22
# get new outlier count
df <- data.frame(outliers,stringsAsFactors = F)
lapply(df,detectNAs)
## $carat
## [1] 42
##
## $cut
## [1] 1
##
## $color
## [1] 1
##
## $clarity
## [1] 1
##
## $depth
## [1] 280
##
## $table
## [1] 30
##
## $price
## [1] 123
##
## $x
## [1] 8
##
## $y
## [1] 9
##
## $z
## [1] 22
write.csv(x = df,file = "df.csv",row.names = FALSE)
Let’s Impute values for NAs by KNN
df <- read.csv("df.csv",stringsAsFactors = F)
#here we need to convert catrgorical variables into factors
df$cut <- as.factor(df$cut)
levels(df$cut) <-c(1,2,3,4,5,6)
df$color <- as.factor(df$color)
levels(df$color) <- c(11,12,13,14,15,16,17,18)
df$clarity <- as.factor(df$clarity)
levels(df$clarity) <- c(21,22,23,24,25,26,27,28,29)
summary(df)
## carat cut color clarity
## Min. :0.2000 1 : 1610 15 :11292 23 :13065
## 1st Qu.:0.4000 2 : 4905 13 : 9796 26 :12258
## Median :0.7000 3 :21550 14 : 9542 24 : 9194
## Mean :0.7961 4 :13791 16 : 8304 25 : 8169
## 3rd Qu.:1.0400 5 :12082 12 : 6774 28 : 5066
## Max. :2.8000 6 : 1 (Other): 8231 (Other): 6187
## NA's :42 NA's: 1 NA's : 1 NA's : 1
## depth table price x
## Min. :56.50 Min. :49.00 Min. : 326 Min. : 3.730
## 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 948 1st Qu.: 4.710
## Median :61.80 Median :57.00 Median : 2396 Median : 5.700
## Mean :61.74 Mean :57.45 Mean : 3900 Mean : 5.732
## 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5294 3rd Qu.: 6.540
## Max. :67.00 Max. :68.00 Max. :18447 Max. :10.740
## NA's :280 NA's :30 NA's :123 NA's :8
## y z
## Min. : 3.680 Min. :1.070
## 1st Qu.: 4.720 1st Qu.:2.910
## Median : 5.710 Median :3.530
## Mean : 5.734 Mean :3.539
## 3rd Qu.: 6.540 3rd Qu.:4.040
## Max. :10.540 Max. :6.980
## NA's :9 NA's :22
df <- filter(df,!(is.na(df$cut)))
df <- filter(df,!(is.na(df$color)))
df <- filter(df,!(is.na(df$clarity)))
df <- filter(df,!(is.na(df$table)))
df <- filter(df,!(is.na(df$x)))
df <- filter(df,!(is.na(df$y)))
df <- filter(df,!(is.na(df$z)))
summary(df)
## carat cut color clarity
## Min. :0.2000 1: 1582 15 :11281 23 :13056
## 1st Qu.:0.4000 2: 4901 13 : 9788 26 :12243
## Median :0.7000 3:21545 14 : 9531 24 : 9178
## Mean :0.7958 4:13777 16 : 8289 25 : 8158
## 3rd Qu.:1.0400 5:12078 12 : 6770 28 : 5066
## Max. :2.8000 6: 1 17 : 5417 27 : 3654
## NA's :42 (Other): 2808 (Other): 2529
## depth table price x
## Min. :56.50 Min. :49.00 Min. : 326 Min. : 3.730
## 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 947 1st Qu.: 4.710
## Median :61.80 Median :57.00 Median : 2396 Median : 5.700
## Mean :61.74 Mean :57.45 Mean : 3898 Mean : 5.731
## 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5294 3rd Qu.: 6.540
## Max. :67.00 Max. :68.00 Max. :18447 Max. :10.740
## NA's :271 NA's :122
## y z
## Min. : 3.680 Min. :1.070
## 1st Qu.: 4.720 1st Qu.:2.910
## Median : 5.710 Median :3.530
## Mean : 5.733 Mean :3.539
## 3rd Qu.: 6.540 3rd Qu.:4.040
## Max. :10.540 Max. :6.980
##
machine learning 1
stepModel <- step(lm(data = df, price~carat+cut+color+clarity), trace = 0, steps = 100)
summary(stepModel)
##
## Call:
## lm(formula = price ~ carat + cut + color + clarity, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8730.8 -673.8 -190.8 467.6 9847.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7332.21 1118.22 -6.557 5.54e-11 ***
## carat 8899.42 11.83 752.266 < 2e-16 ***
## cut2 640.05 32.84 19.490 < 2e-16 ***
## cut3 981.36 29.98 32.731 < 2e-16 ***
## cut4 853.61 30.25 28.216 < 2e-16 ***
## cut5 829.53 30.58 27.130 < 2e-16 ***
## cut6 -127.65 1117.54 -0.114 0.90906
## color12 295.32 1117.23 0.264 0.79152
## color13 83.64 1117.21 0.075 0.94032
## color14 -20.69 1117.21 -0.019 0.98523
## color15 -221.98 1117.21 -0.199 0.84250
## color16 -689.62 1117.22 -0.617 0.53706
## color17 -1136.88 1117.27 -1.018 0.30889
## color18 -2016.99 1117.37 -1.805 0.07106 .
## clarity22 5091.98 50.90 100.032 < 2e-16 ***
## clarity23 3244.99 43.75 74.169 < 2e-16 ***
## clarity24 2301.07 43.98 52.326 < 2e-16 ***
## clarity25 4216.25 44.62 94.487 < 2e-16 ***
## clarity26 3894.95 43.97 88.577 < 2e-16 ***
## clarity27 4756.23 47.14 100.901 < 2e-16 ***
## clarity28 4650.06 45.89 101.328 < 2e-16 ***
## clarity29 3170.97 1118.02 2.836 0.00457 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1117 on 53704 degrees of freedom
## (158 observations deleted due to missingness)
## Multiple R-squared: 0.919, Adjusted R-squared: 0.9189
## F-statistic: 2.9e+04 on 21 and 53704 DF, p-value: < 2.2e-16
observation From the above observation we can see that Adj. r-squared is high and F-statistic is also higher than p-value that means price is significantly dependent on “Carat”, “Cut”, “Color” and “Clarity” variables.
## So we can now impute NAs in price variable on forementioned variables.
library(VIM)
## Warning: package 'VIM' was built under R version 3.4.4
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 3.4.4
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
df1 <- kNN(df,variable = c("price"),dist_var = c("carat", "cut", "color","clarity"))
summary(df1)
## carat cut color clarity
## Min. :0.2000 1: 1582 15 :11281 23 :13056
## 1st Qu.:0.4000 2: 4901 13 : 9788 26 :12243
## Median :0.7000 3:21545 14 : 9531 24 : 9178
## Mean :0.7958 4:13777 16 : 8289 25 : 8158
## 3rd Qu.:1.0400 5:12078 12 : 6770 28 : 5066
## Max. :2.8000 6: 1 17 : 5417 27 : 3654
## NA's :42 (Other): 2808 (Other): 2529
## depth table price x
## Min. :56.50 Min. :49.00 Min. : 326 Min. : 3.730
## 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 949 1st Qu.: 4.710
## Median :61.80 Median :57.00 Median : 2400 Median : 5.700
## Mean :61.74 Mean :57.45 Mean : 3919 Mean : 5.731
## 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5323 3rd Qu.: 6.540
## Max. :67.00 Max. :68.00 Max. :18447 Max. :10.740
## NA's :271
## y z price_imp
## Min. : 3.680 Min. :1.070 Mode :logical
## 1st Qu.: 4.720 1st Qu.:2.910 FALSE:53762
## Median : 5.710 Median :3.530 TRUE :122
## Mean : 5.733 Mean :3.539
## 3rd Qu.: 6.540 3rd Qu.:4.040
## Max. :10.540 Max. :6.980
##
df <- dplyr::select(df1, carat:z)
summary(df)
## carat cut color clarity
## Min. :0.2000 1: 1582 15 :11281 23 :13056
## 1st Qu.:0.4000 2: 4901 13 : 9788 26 :12243
## Median :0.7000 3:21545 14 : 9531 24 : 9178
## Mean :0.7958 4:13777 16 : 8289 25 : 8158
## 3rd Qu.:1.0400 5:12078 12 : 6770 28 : 5066
## Max. :2.8000 6: 1 17 : 5417 27 : 3654
## NA's :42 (Other): 2808 (Other): 2529
## depth table price x
## Min. :56.50 Min. :49.00 Min. : 326 Min. : 3.730
## 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 949 1st Qu.: 4.710
## Median :61.80 Median :57.00 Median : 2400 Median : 5.700
## Mean :61.74 Mean :57.45 Mean : 3919 Mean : 5.731
## 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5323 3rd Qu.: 6.540
## Max. :67.00 Max. :68.00 Max. :18447 Max. :10.740
## NA's :271
## y z
## Min. : 3.680 Min. :1.070
## 1st Qu.: 4.720 1st Qu.:2.910
## Median : 5.710 Median :3.530
## Mean : 5.733 Mean :3.539
## 3rd Qu.: 6.540 3rd Qu.:4.040
## Max. :10.540 Max. :6.980
##
Impute NAs for other variables
df1 <- kNN(df,variable = c("carat","depth"))
summary(df1)
## carat cut color clarity
## Min. :0.2000 1: 1582 15 :11281 23 :13056
## 1st Qu.:0.4000 2: 4901 13 : 9788 26 :12243
## Median :0.7000 3:21545 14 : 9531 24 : 9178
## Mean :0.7967 4:13777 16 : 8289 25 : 8158
## 3rd Qu.:1.0400 5:12078 12 : 6770 28 : 5066
## Max. :2.8000 6: 1 17 : 5417 27 : 3654
## (Other): 2808 (Other): 2529
## depth table price x
## Min. :56.50 Min. :49.00 Min. : 326 Min. : 3.730
## 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 949 1st Qu.: 4.710
## Median :61.80 Median :57.00 Median : 2400 Median : 5.700
## Mean :61.74 Mean :57.45 Mean : 3919 Mean : 5.731
## 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5323 3rd Qu.: 6.540
## Max. :67.00 Max. :68.00 Max. :18447 Max. :10.740
##
## y z carat_imp depth_imp
## Min. : 3.680 Min. :1.070 Mode :logical Mode :logical
## 1st Qu.: 4.720 1st Qu.:2.910 FALSE:53842 FALSE:53613
## Median : 5.710 Median :3.530 TRUE :42 TRUE :271
## Mean : 5.733 Mean :3.539
## 3rd Qu.: 6.540 3rd Qu.:4.040
## Max. :10.540 Max. :6.980
##
df <- dplyr::select(df1,-c("carat_imp","depth_imp"))
Machine Learning 2
cut.model <- step(glm(data = df, formula = factor(cut)~price + carat + color+ clarity,
family = binomial(link = "logit")), trace = 0,steps = 100)
summary(cut.model)
##
## Call:
## glm(formula = factor(cut) ~ price + carat + color + clarity,
## family = binomial(link = "logit"), data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.9914 0.1497 0.1980 0.2570 2.8243
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.359e+01 1.970e+02 0.069 0.944985
## price 6.425e-04 3.085e-05 20.828 < 2e-16 ***
## carat -5.465e+00 2.334e-01 -23.422 < 2e-16 ***
## color12 -8.398e+00 1.970e+02 -0.043 0.965992
## color13 -8.260e+00 1.970e+02 -0.042 0.966552
## color14 -8.523e+00 1.970e+02 -0.043 0.965485
## color15 -8.323e+00 1.970e+02 -0.042 0.966294
## color16 -8.123e+00 1.970e+02 -0.041 0.967103
## color17 -7.781e+00 1.970e+02 -0.040 0.968489
## color18 -7.476e+00 1.970e+02 -0.038 0.969723
## clarity22 1.153e+00 3.724e-01 3.097 0.001956 **
## clarity23 3.317e-01 1.407e-01 2.358 0.018375 *
## clarity24 4.588e-01 1.247e-01 3.679 0.000235 ***
## clarity25 2.049e-01 1.683e-01 1.217 0.223466
## clarity26 3.460e-01 1.564e-01 2.213 0.026921 *
## clarity27 1.328e+00 2.911e-01 4.563 5.05e-06 ***
## clarity28 3.231e-01 1.976e-01 1.635 0.102134
## clarity29 8.404e+00 1.970e+02 0.043 0.965968
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 14280 on 53883 degrees of freedom
## Residual deviance: 12498 on 53866 degrees of freedom
## AIC: 12534
##
## Number of Fisher Scoring iterations: 10
observations From above mentioned Null and residual dveiances we can state that “cut” is not significantly dependent only on aforementioned variables as adding them has reduced “residual deviance” not so significantly.
Exploratory Data Analysis
# Average price with carat & cut
Avg.Price1 <- group_by(df,carat,cut)
Avg.Price1 <- arrange(select(Avg.Price1,carat,cut,price),carat)
Avg.Price1 <- summarise(Avg.Price1,Average.price = mean(price))
head(Avg.Price1)
## # A tibble: 6 x 3
## # Groups: carat [3]
## carat cut Average.price
## <dbl> <fct> <dbl>
## 1 0.2 3 367
## 2 0.2 4 364.
## 3 0.2 5 367
## 4 0.21 4 380.
## 5 0.21 5 386
## 6 0.22 1 337
# Average price with carat & clarity
Avg.Price2 <- group_by(df,carat,clarity)
Avg.Price2 <- arrange(select(Avg.Price2,carat,clarity,price),carat)
Avg.Price2 <- summarise(Avg.Price2,Average.price = mean(price))
head(Avg.Price2)
## # A tibble: 6 x 3
## # Groups: carat [3]
## carat clarity Average.price
## <dbl> <fct> <dbl>
## 1 0.2 24 345
## 2 0.2 26 367
## 3 0.21 23 326
## 4 0.21 24 394
## 5 0.21 26 386
## 6 0.22 23 406
# Average price with carat & color
Avg.Price3 <- group_by(df,carat,color)
Avg.Price3 <- arrange(select(Avg.Price3,carat,color,price),carat)
Avg.Price3 <- summarise(Avg.Price3,Average.price = mean(price))
head(Avg.Price3)
## # A tibble: 6 x 3
## # Groups: carat [3]
## carat color Average.price
## <dbl> <fct> <dbl>
## 1 0.2 12 367
## 2 0.2 13 364.
## 3 0.2 14 367
## 4 0.21 12 386
## 5 0.21 13 376.
## 6 0.22 12 404
Visual Data Analysis
ggplot.price <- ggplot(df, aes(x = price))+
geom_histogram(bins = 100)+
labs(title = "Distribution of Price")
ggplotly(ggplot.price)
ggplot.cut <- ggplot(df, aes(x = cut))+
geom_histogram(stat = 'count',fill='red',binwidth = 2)+
labs(title = 'Distribution of Cut')
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplotly(ggplot.cut)
observation #In overall distribution of data, “Ideal” type of cut has highest count of 21545 followed by “Premium” and “Very Good”,each with count of 13777 and 12078 respectively.
ggplot.carat.cut <- ggplot(df,aes(x = cut, y = carat))+
geom_jitter(aes(colour = cut))+
labs(title = "Distribution of carat in each cut",x = "cut",y = "carat")
ggplot.carat.cut
ggplot.depth.price<-ggplot(df,aes(x = carat,y = price))+
geom_point()+stat_smooth(method = 'lm',position = 'identity')+
labs(title = "Price vs carat", x = "carat",y = "price",
subtitle = paste("Correlation between carat and price",cor(df$carat,df$price)))
ggplot.depth.price