Libraries and dataframes.
setwd("C:/Users/tomas/Downloads/eda-course-materials")
library(dslabs)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(readr)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(reshape2)
library(quantreg)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(memisc)
## Loading required package: lattice
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
##
## Attaching package: 'memisc'
## The following object is masked from 'package:scales':
##
## percent
## The following object is masked from 'package:ggplot2':
##
## syms
## The following objects are masked from 'package:dplyr':
##
## collect, recode, rename, syms
## The following objects are masked from 'package:stats':
##
## contr.sum, contr.treatment, contrasts
## The following object is masked from 'package:base':
##
## as.array
library(lattice)
library(MASS)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:memisc':
##
## recode
## The following object is masked from 'package:dplyr':
##
## recode
data("diamonds")
diamondsbig <- read.csv("C:/Users/tomas/Downloads/eda-course-materials/lesson6/diamondsbig.csv")
First we can create a plot matrix, using a sample of the data, to see the relationships between the different variables and the price.
The upper and lower parameters define the aesthetics of the scatterplots and boxplots.
axisLabels = "internal sets the labels to the diagonal in the middle.
set.seed(16)
diamond_samp <- diamonds[sample.int(nrow(diamonds), 10000), ]
ggpairs(diamond_samp,
lower = list(continuous = wrap("points", shape = I('.'))),
upper = list(combo = wrap("box", outlier.shape = I('.'))),
axisLabels = "internal")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Price seems to be affected by cut, carat, clarity and color.
The histogram of price shows it is long-tailed, we can apply a log 10 transformation.
Carat is a product of xyz, so we should apply a cube root transformation. As no transformation exists we can create a function with a new transformation with the function function() trans_new(“NAME”, transform = function(x) TRANSFORMATION OF X, inverse = function(x) INVERSE FUNCTION).
The layer + ggtitle(“TITLE”) adds a title to the plot
cuberoot_trans <- function()trans_new('cuberoot', transform = function(x) x^(1/3),
inverse = function(x) x^3)
ggplot(aes(carat, price), data = diamonds) +
geom_point(alpha=1/5, size=0.8, position = "jitter") +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat')
## Warning: Removed 1693 rows containing missing values (geom_point).
Now we can add the other variables to the plot in the form of color.
With the package RColorBrewer we can add a color palette and a guide for the legend of the plot.
The modification is made with the layer scale_color_brewer(type = ‘COLOR PALETTE’, guide = guide_legend(title = ‘TITLE’, reverse = T OF F, override.aes = list(alpha = 1, size = SIZE OF THE LEGEND))).
p1 <- ggplot(aes(x = carat, y = price, color = cut), data = diamonds) +
geom_point(alpha = 0.5, size = 1, position = 'jitter') +
scale_color_brewer(type = 'div',
guide = guide_legend(title = 'Cut', reverse = T,
override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Cut')
p2 <- ggplot(aes(x = carat, y = price, color = color), data = diamonds) +
geom_point(alpha = 0.5, size = 1, position = 'jitter') +
scale_color_brewer(type = 'div',
guide = guide_legend(title = "Color", reverse = F,
override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Color')
p3 <- ggplot(aes(x = carat, y = price), data = diamonds) +
geom_point(aes(color=clarity), alpha = 0.5, size = 1, position = 'jitter') +
scale_color_brewer(type = 'div',
guide = guide_legend(title = 'Clarity', reverse = T,
override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Clarity')
grid.arrange(p1, p2, p3, ncol=1)
## Warning: Removed 1689 rows containing missing values (geom_point).
## Warning: Removed 1690 rows containing missing values (geom_point).
## Warning: Removed 1694 rows containing missing values (geom_point).
Now that we have the plots, we can create a linear model using a sample of a new bigger, dataframe.
The linear model is built with the function lm(data = DATAFRAME, I((DEPENDANT VARIABLE) ~ I(INDEPENDANT VARIABLE)), and then adding more independent variables with update(MODEL, ~ . + NEW INDEPENDANT VARIABLE).
The function mtable() displays the results from the model.
diamondsbig <- read.csv("C:/Users/tomas/Downloads/eda-course-materials/lesson6/diamondsbig.csv")
m1 <- lm(data = subset(diamondsbig[sample.int(nrow(diamondsbig), 10000), ], price<1000 & cert=="GIA"),
I(log(price) ~ I(carat^1/3)))
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + color)
m4 <- update(m3, ~ . + cut)
m5 <- update(m4, ~ . + clarity)
mtable(m5)
##
## Calls:
## m5: lm(formula = log(price) ~ I(carat^1/3) + carat + color + cut +
## clarity, data = subset(diamondsbig[sample.int(nrow(diamondsbig),
## 10000), ], price < 1000 & cert == "GIA"))
##
## ==============================
## (Intercept) 5.101***
## (0.043)
## I(carat^1/3) 9.766***
## (0.243)
## color: E/D -0.045**
## (0.014)
## color: F/D -0.060***
## (0.014)
## color: G/D -0.091***
## (0.015)
## color: H/D -0.132***
## (0.017)
## color: I/D -0.220***
## (0.018)
## color: J/D -0.330***
## (0.023)
## color: K/D -0.431***
## (0.029)
## color: L/D -0.459***
## (0.037)
## cut: Ideal 0.104***
## (0.013)
## cut: V.Good 0.062***
## (0.015)
## clarity: I2 -0.406***
## (0.079)
## clarity: IF 0.551***
## (0.030)
## clarity: SI1 0.397***
## (0.025)
## clarity: SI2 0.343***
## (0.026)
## clarity: VS1 0.445***
## (0.027)
## clarity: VS2 0.429***
## (0.026)
## clarity: VVS1 0.527***
## (0.027)
## clarity: VVS2 0.486***
## (0.027)
## ------------------------------
## R-squared 0.527
## N 1627
## ==============================
## Significance:
## *** = p < 0.001;
## ** = p < 0.01;
## * = p < 0.05
Finally, to predict the price of a diamond, we create a new dataframe with the characteristics of the diamond to predict.
Then we use the function predict(MODEL, newdata = EXAMPLE OBJECT, interval = “prediction”, level = CONFIDENCE INTERVAL) and apply it to a new vector.
thisDiamond = data.frame(carat = 1.00, cut = "V.Good",
color = "I", clarity="VS1")
modelEstimate = predict(m5, newdata = thisDiamond,
interval="prediction", level = .95)
## Warning in predict.lm(m5, newdata = thisDiamond, interval = "prediction", :
## prediction from a rank-deficient fit may be misleading
exp(modelEstimate)
## fit lwr upr
## 1 5672.888 4025.212 7995.022