Diamond dataset is inside the ggplot library.
library(ggplot2)
Attaching package: <U+393C><U+3E31>ggplot2<U+393C><U+3E32>
The following object is masked _by_ <U+393C><U+3E31>.GlobalEnv<U+393C><U+3E32>:
diamonds
data=diamonds
#Getting structure of Diamond dataset
str(data)
Classes tbl_df, tbl and 'data.frame': 53940 obs. of 11 variables:
$ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
$ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
$ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
$ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
$ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
$ table : num 55 61 65 58 58 57 57 55 61 61 ...
$ price : int 326 326 327 334 335 336 336 337 337 338 ...
$ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
$ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
$ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
$ volume : num 38.2 34.5 38.1 46.7 51.9 ...
Cut, Color and Clarity are factor variables and other are numerical variables.
# Histogram of price
ggplot(aes(x=price),data=diamonds)+geom_histogram()
Histogram is skewed right skewed.
#Summary statistics
summary(diamonds$price)
Min. 1st Qu. Median Mean 3rd Qu. Max.
326 950 2401 3933 5324 18820
Answering the following questions 1. How many cost less than U$500? 2. How many cost less than U$250? 3. How many cost equal to U$15,000 or more?
#Cost less than US$500
sum(diamonds$price<500)
[1] 1729
#Cost less than US$250
sum(diamonds$price<250)
[1] 0
#cost equal to U$15,000 or more
sum(diamonds$price>=15000)
[1] 1656
# Explore the largest peak in the
ggplot(aes(x=price),data=diamonds)+geom_histogram(binwidth = 1000,col='red')+ggtitle('Histogram of the price')+ylab('Frequency')+xlab('Diamond price')
# Break out the histogram of diamond prices by cut.
ggplot(aes(x=price),data=diamonds)+geom_histogram(binwidth = 500,col='red')+ggtitle('Histogram of the price')+ylab('Frequency')+xlab('Diamond price')+facet_wrap(~cut)+theme_minimal()
#Higest price diamond
subset(diamonds,price==max(price))
Premimum cut has maximum price diamond.
#Lowest Price Diamond
subset(diamonds,price==min(price))
Ideal and premium has lowest price diamonds.
To find lowest mean of the diamond cuts
#Subsetting the diamonds by cut
Fair = diamonds[which(diamonds$cut == "Fair"),]
Good = diamonds[which(diamonds$cut == "Good"),]
VaryGood = diamonds[which(diamonds$cut == "Very Good"),]
Premium = diamonds[which(diamonds$cut == "Premium"),]
Ideal = diamonds[which(diamonds$cut == "Ideal"),]
mean(Fair$price)
[1] 4358.758
mean(Good$price)
[1] 3928.864
mean(VaryGood$price)
[1] 3981.76
mean(Premium$price)
[1] 4584.258
mean(Ideal$price)
[1] 3457.542
In the previous histogram, Scles of y was same for all the cuts. So it was hard to interpret from graph. Now we are changing the scale by just adding scales=free_y
ggplot(aes(x=price),data=diamonds)+geom_histogram(binwidth = 500,col='red')+ggtitle('Histogram of the price')+ylab('Frequency')+xlab('Diamond price')+facet_wrap(~cut,scales="free_y")+theme_minimal()
Now figure out price per caret by cut.
#Histogram of price per caret by cut
ggplot(aes(x=price/carat),data=diamonds)+geom_histogram(binwidth = 500,col='red')+ggtitle('Histogram of the price per carat')+ylab('Frequency')+xlab('Diamond price per carat')+facet_wrap(~cut,scales="free_y")+theme_minimal()
Using log10 for x
ggplot(aes(x=price/carat),data=diamonds)+geom_histogram(binwidth = 0.1,col='red')+ggtitle('Histogram of the price per carat')+ylab('Frequency')+xlab('Diamond price per carat')+facet_wrap(~cut,scales="free_y")+theme_minimal()+scale_x_log10()
#Plot price and carat by cut
ggplot(aes(x=price,y=carat),data=diamonds)+geom_line()+ylab('carat')+xlab('Diamond price')+facet_wrap(~cut,scales="free_y")+theme_minimal()
Now it’s term of some interesting boxplots
# Investigate the price of diamonds using box plots
ggplot(diamonds,aes(factor(cut),price,fill=cut))+geom_boxplot()+ggtitle('Boxplot of price by cut')
# Investigate the price of diamonds using box plots
ggplot(diamonds,aes(factor(color),price,fill=color))+geom_boxplot()+ggtitle('Boxplot of price by color')
#Subsetting the diamonds by color
D = subset(diamonds,diamonds$color == "D")
E = subset(diamonds,diamonds$color == "E")
F = subset(diamonds,diamonds$color == "F")
G = subset(diamonds,diamonds$color == "G")
H = subset(diamonds,diamonds$color == "H")
I = subset(diamonds,diamonds$color == "I")
J = subset(diamonds,diamonds$color == "J")
summary(D)
carat cut color clarity depth table
Min. :0.2000 Fair : 163 D:6775 SI1 :2083 Min. :52.2 Min. :52.0
1st Qu.:0.3600 Good : 662 E: 0 VS2 :1697 1st Qu.:61.0 1st Qu.:56.0
Median :0.5300 Very Good:1513 F: 0 SI2 :1370 Median :61.8 Median :57.0
Mean :0.6578 Premium :1603 G: 0 VS1 : 705 Mean :61.7 Mean :57.4
3rd Qu.:0.9050 Ideal :2834 H: 0 VVS2 : 553 3rd Qu.:62.5 3rd Qu.:59.0
Max. :3.4000 I: 0 VVS1 : 252 Max. :71.6 Max. :73.0
J: 0 (Other): 115
price x y z volume
Min. : 357 Min. :0.000 Min. :0.000 Min. :0.000 Min. : 0.00
1st Qu.: 911 1st Qu.:4.590 1st Qu.:4.600 1st Qu.:2.820 1st Qu.: 59.56
Median : 1838 Median :5.230 Median :5.240 Median :3.220 Median : 87.93
Mean : 3170 Mean :5.417 Mean :5.421 Mean :3.343 Mean :107.19
3rd Qu.: 4214 3rd Qu.:6.180 3rd Qu.:6.180 3rd Qu.:3.840 3rd Qu.:146.40
Max. :18693 Max. :9.420 Max. :9.340 Max. :6.270 Max. :551.65
summary(J)
carat cut color clarity depth table
Min. :0.230 Fair :119 D: 0 SI1 :750 Min. :43.00 Min. :51.60
1st Qu.:0.710 Good :307 E: 0 VS2 :731 1st Qu.:61.20 1st Qu.:56.00
Median :1.110 Very Good:678 F: 0 VS1 :542 Median :62.00 Median :58.00
Mean :1.162 Premium :808 G: 0 SI2 :479 Mean :61.89 Mean :57.81
3rd Qu.:1.520 Ideal :896 H: 0 VVS2 :131 3rd Qu.:62.70 3rd Qu.:59.00
Max. :5.010 I: 0 VVS1 : 74 Max. :73.60 Max. :68.00
J:2808 (Other):101
price x y z volume
Min. : 335 Min. : 3.930 Min. : 3.900 Min. :2.460 Min. : 37.7
1st Qu.: 1860 1st Qu.: 5.700 1st Qu.: 5.718 1st Qu.:3.530 1st Qu.:115.4
Median : 4234 Median : 6.640 Median : 6.630 Median :4.110 Median :181.3
Mean : 5324 Mean : 6.519 Mean : 6.518 Mean :4.033 Mean :188.5
3rd Qu.: 7695 3rd Qu.: 7.380 3rd Qu.: 7.380 3rd Qu.:4.580 3rd Qu.:248.0
Max. :18710 Max. :10.740 Max. :10.540 Max. :6.980 Max. :790.1
#IQR of best color
IQR(D$price)
[1] 3302.5
#IQR of worst color
IQR(J$price)
[1] 5834.5
# Investigate the price per carat of diamonds using box plots
ggplot(diamonds,aes(factor(color),price/carat,fill=color))+geom_boxplot()+ggtitle('Boxplot of price by color')
#Frequency polygon
ggplot(data=diamonds, aes(x=carat)) + geom_freqpoly() + ggtitle("Diamond Frequency by Carat")
# scatterplot of price vs x.
ggplot(data=diamonds,aes(x=price,y=x))+geom_point()
#Correlation of price and x
cor.test(data$price,data$x)
Pearson's product-moment correlation
data: data$price and data$x
t = 440.16, df = 53938, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.8825835 0.8862594
sample estimates:
cor
0.8844352
#Correlation of price and y
cor.test(data$price,data$y)
Pearson's product-moment correlation
data: data$price and data$y
t = 401.14, df = 53938, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.8632867 0.8675241
sample estimates:
cor
0.8654209
#Correlation of price and Z
cor.test(data$price,data$z)
Pearson's product-moment correlation
data: data$price and data$z
t = 393.6, df = 53938, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.8590541 0.8634131
sample estimates:
cor
0.8612494
#Create a simple scatter plot of price vs depth
ggplot(data = diamonds, aes(x = depth, y = price)) +
geom_point(alpha=1/100)+scale_x_continuous(breaks=seq(50,80,1))
#Correlation of depth and price
cor.test(diamonds$depth,diamonds$price)
Pearson's product-moment correlation
data: diamonds$depth and diamonds$price
t = -2.473, df = 53938, p-value = 0.0134
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.019084756 -0.002208537
sample estimates:
cor
-0.0106474
#Create a scatterplot of price vs carat
#and omit the top 1% of price and carat
ggplot(aes(carat,price),data=diamonds)+geom_point(position = position_jitter(h=0))
# Create a scatterplot of price vs. volume (x * y * z)
# Create a new variable for volume in the diamonds data frame.
diamonds$volume=diamonds$x*diamonds$y*diamonds$z
ggplot(data=diamonds,aes(x=volume,y=price))+geom_point()
#Count of diamonds whoes volume 0 and greater than 800
library(dplyr)
Attaching package: <U+393C><U+3E31>dplyr<U+393C><U+3E32>
The following objects are masked from <U+393C><U+3E31>package:stats<U+393C><U+3E32>:
filter, lag
The following objects are masked from <U+393C><U+3E31>package:base<U+393C><U+3E32>:
intersect, setdiff, setequal, union
diamond_subset=filter(diamonds,!( diamonds$volume >=800 | diamonds$volume==0 ))
cor.test(diamond_subset$volume,diamond_subset$price)
Pearson's product-moment correlation
data: diamond_subset$volume and diamond_subset$price
t = 559.19, df = 53915, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9222944 0.9247772
sample estimates:
cor
0.9235455
#Scatterplot of volume and price excluding volume 0 and greater than 800
ggplot(aes(x=volume,y=price),data=diamond_subset)+geom_point()+geom_smooth()
#the data frame diamondsByClarity
diamonds_clarity=group_by(diamonds,clarity)
diamondsByClarity=summarise(diamonds_clarity,clarity_maen=mean(as.numeric(clarity)),clarity_median=median(as.numeric(clarity)),n=n())
#First top 6 rows
head(diamonds,6)
#last top 6 rows
tail(diamondsByClarity,6)
# Group by clarity and color
diamonds_by_clarity <- group_by(diamonds, clarity)
diamonds_mp_by_clarity <- summarise(diamonds_by_clarity, mean_price = mean(price))
diamonds_by_color <- group_by(diamonds, color)
diamonds_mp_by_color <- summarise(diamonds_by_color, mean_price = mean(price))
#Barplot of clarity
clarity=ggplot(aes(x=clarity,y=mean_price),data=diamonds_mp_by_clarity)+geom_bar(stat="identity",col='red')
color=ggplot(aes(x=color,y=mean_price),data=diamonds_mp_by_color)+geom_bar(stat="identity",col='blue')
#Histogram of price of different colors
ggplot(aes(x=price),data = diamonds)+geom_histogram(binwidth = 500)+facet_wrap(~color)+scale_fill_brewer(type='qual')
# Create a scatterplot of diamond price vs cut
ggplot(aes(x=price,y=table),data=diamonds)+geom_point()+scale_color_brewer(color,type = 'qual')
#scatterplot of diamond price vs volumn
ggplot(aes(x=price,y=log(x*y*z),color=clarity),data=diamonds)+geom_point()+scale_color_brewer(type='div')
ggpairs(diamond_shap)
plot: [1,1] [=---------------------------------------------------------------] 1% est: 0s
plot: [1,2] [=---------------------------------------------------------------] 2% est:23s
plot: [1,3] [==--------------------------------------------------------------] 2% est:35s
plot: [1,4] [==--------------------------------------------------------------] 3% est:35s
plot: [1,5] [===-------------------------------------------------------------] 4% est:37s
plot: [1,6] [===-------------------------------------------------------------] 5% est:34s
plot: [1,7] [====------------------------------------------------------------] 6% est:33s
plot: [1,8] [====------------------------------------------------------------] 7% est:34s
plot: [1,9] [=====-----------------------------------------------------------] 7% est:33s
plot: [1,10] [=====----------------------------------------------------------] 8% est:31s
plot: [1,11] [======---------------------------------------------------------] 9% est:30s
plot: [2,1] [======----------------------------------------------------------] 10% est:30s
plot: [2,2] [=======---------------------------------------------------------] 11% est:39s
plot: [2,3] [=======---------------------------------------------------------] 12% est:38s
plot: [2,4] [========--------------------------------------------------------] 12% est:41s
plot: [2,5] [========--------------------------------------------------------] 13% est:45s
plot: [2,6] [=========-------------------------------------------------------] 14% est:44s
plot: [2,7] [==========------------------------------------------------------] 15% est:44s
plot: [2,8] [==========------------------------------------------------------] 16% est:43s
plot: [2,10] [===========----------------------------------------------------] 17% est:44s
plot: [2,11] [===========----------------------------------------------------] 18% est:44s
plot: [3,1] [============----------------------------------------------------] 19% est:44s
plot: [3,2] [=============---------------------------------------------------] 20% est:48s
plot: [3,3] [=============---------------------------------------------------] 21% est:49s
plot: [3,4] [==============--------------------------------------------------] 21% est:47s
plot: [3,5] [==============--------------------------------------------------] 22% est:48s
plot: [3,8] [================------------------------------------------------] 25% est:46s
plot: [3,9] [================------------------------------------------------] 26% est:45s
plot: [3,10] [=================----------------------------------------------] 26% est:44s
plot: [3,11] [=================----------------------------------------------] 27% est:44s
plot: [4,1] [==================----------------------------------------------] 28% est:43s
plot: [4,2] [===================---------------------------------------------] 29% est:44s
plot: [4,3] [===================---------------------------------------------] 30% est:44s
plot: [4,4] [====================--------------------------------------------] 31% est:45s
plot: [4,5] [====================--------------------------------------------] 31% est:43s
plot: [4,6] [=====================-------------------------------------------] 32% est:43s
plot: [4,7] [=====================-------------------------------------------] 33% est:42s
plot: [4,10] [======================-----------------------------------------] 36% est:40s
plot: [4,11] [=======================----------------------------------------] 36% est:39s
plot: [5,1] [========================----------------------------------------] 37% est:39s
plot: [5,2] [========================----------------------------------------] 38% est:38s
plot: [5,3] [=========================---------------------------------------] 39% est:38s
plot: [5,4] [=========================---------------------------------------] 40% est:38s
plot: [5,5] [==========================--------------------------------------] 40% est:38s
plot: [5,6] [==========================--------------------------------------] 41% est:37s
plot: [5,7] [===========================-------------------------------------] 42% est:36s
plot: [5,8] [============================------------------------------------] 43% est:35s
plot: [5,9] [============================------------------------------------] 44% est:34s
plot: [5,10] [============================-----------------------------------] 45% est:33s
plot: [5,11] [=============================----------------------------------] 45% est:32s
plot: [6,1] [==============================----------------------------------] 46% est:31s
plot: [6,2] [==============================----------------------------------] 47% est:30s
plot: [6,4] [===============================---------------------------------] 49% est:30s
plot: [6,5] [================================--------------------------------] 50% est:30s
plot: [6,6] [================================--------------------------------] 50% est:30s
plot: [6,7] [=================================-------------------------------] 51% est:29s
plot: [6,8] [=================================-------------------------------] 52% est:28s
plot: [6,9] [==================================------------------------------] 53% est:27s
plot: [6,10] [==================================-----------------------------] 54% est:27s
plot: [6,11] [==================================-----------------------------] 55% est:26s
plot: [7,1] [===================================-----------------------------] 55% est:25s
plot: [7,2] [====================================----------------------------] 56% est:25s
plot: [7,3] [====================================----------------------------] 57% est:24s
plot: [7,5] [======================================--------------------------] 59% est:24s
plot: [7,6] [======================================--------------------------] 60% est:24s
plot: [7,7] [=======================================-------------------------] 60% est:23s
plot: [7,8] [=======================================-------------------------] 61% est:22s
plot: [7,9] [========================================------------------------] 62% est:22s
plot: [7,10] [========================================-----------------------] 63% est:21s
plot: [7,11] [========================================-----------------------] 64% est:20s
plot: [8,1] [=========================================-----------------------] 64% est:20s
plot: [8,2] [==========================================----------------------] 65% est:19s
plot: [8,3] [==========================================----------------------] 66% est:19s
plot: [8,4] [===========================================---------------------] 67% est:19s
plot: [8,5] [===========================================---------------------] 68% est:18s
plot: [8,6] [============================================--------------------] 69% est:18s
plot: [8,7] [============================================--------------------] 69% est:17s
plot: [8,8] [=============================================-------------------] 70% est:17s
plot: [8,9] [=============================================-------------------] 71% est:16s
plot: [9,2] [================================================----------------] 74% est:14s
plot: [9,3] [================================================----------------] 75% est:13s
plot: [9,4] [=================================================---------------] 76% est:13s
plot: [9,5] [=================================================---------------] 77% est:13s
plot: [9,6] [==================================================--------------] 78% est:12s
plot: [9,7] [==================================================--------------] 79% est:12s
plot: [9,8] [===================================================-------------] 79% est:11s
plot: [9,9] [===================================================-------------] 80% est:11s
plot: [9,10] [===================================================------------] 81% est:10s
plot: [9,11] [====================================================-----------] 82% est:10s
plot: [10,1] [====================================================-----------] 83% est: 9s
plot: [10,2] [=====================================================----------] 83% est: 9s
plot: [10,3] [=====================================================----------] 84% est: 8s
plot: [10,5] [======================================================---------] 86% est: 8s
plot: [10,6] [=======================================================--------] 87% est: 7s
plot: [10,7] [=======================================================--------] 88% est: 7s
plot: [10,8] [========================================================-------] 88% est: 6s
plot: [10,9] [========================================================-------] 89% est: 6s
plot: [10,10] [========================================================------] 90% est: 5s
plot: [10,11] [========================================================------] 91% est: 5s
plot: [11,1] [==========================================================-----] 92% est: 4s
plot: [11,2] [==========================================================-----] 93% est: 4s
plot: [11,3] [===========================================================----] 93% est: 3s
plot: [11,4] [===========================================================----] 94% est: 3s
plot: [11,5] [============================================================---] 95% est: 3s
plot: [11,6] [============================================================---] 96% est: 2s
plot: [11,7] [=============================================================--] 97% est: 2s
plot: [11,8] [=============================================================--] 98% est: 1s
plot: [11,9] [==============================================================-] 98% est: 1s
plot: [11,10] [=============================================================-] 99% est: 0s
plot: [11,11] [==============================================================]100% est: 0s
library(gridExtra)
plot1 <- ggplot(aes(x=price),data=diamonds)+geom_histogram(binwidth = 0.5,color='red')
plot2 <- ggplot(aes(x=log(price)),data=diamonds)+geom_histogram(binwidth = 0.05,color='blue')
grid.arrange(plot1,plot2,ncol=2)
#Tranforming plot
ggplot(aes(x=carat,y=price),data=diamonds)+geom_point(color='blue')+scale_y_continuous(trans = log10_trans())
head(sort(table(diamonds$carat),decreasing = T))
0.3 0.31 1.01 0.7 0.32 1
2604 2249 2242 1981 1840 1558
#Price table
head(sort(table(diamonds$price),decreasing = T))
605 802 625 828 776 698
132 127 126 125 124 121
summary(m1)
Call:
lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + clarity,
data = diamonds)
Residuals:
Min 1Q Median 3Q Max
-0.7668 -0.1146 0.0112 0.1194 1.9524
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.3910549 0.0138622 28.210 < 2e-16 ***
I(carat^(1/3)) 9.3762150 0.0226614 413.753 < 2e-16 ***
carat -1.2744145 0.0083137 -153.292 < 2e-16 ***
cut.L 0.1245624 0.0031901 39.047 < 2e-16 ***
cut.Q -0.0339681 0.0028065 -12.103 < 2e-16 ***
cut.C 0.0162325 0.0024369 6.661 2.74e-11 ***
cut^4 -0.0009871 0.0019522 -0.506 0.6131
clarity.L 0.8542091 0.0048155 177.389 < 2e-16 ***
clarity.Q -0.2390985 0.0045212 -52.883 < 2e-16 ***
clarity.C 0.1291347 0.0038714 33.356 < 2e-16 ***
clarity^4 -0.0796756 0.0030898 -25.786 < 2e-16 ***
clarity^5 0.0336857 0.0025232 13.350 < 2e-16 ***
clarity^6 0.0036529 0.0021992 1.661 0.0967 .
clarity^7 0.0513649 0.0019377 26.508 < 2e-16 ***
---
Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1 1
Residual standard error: 0.1813 on 53926 degrees of freedom
Multiple R-squared: 0.9681, Adjusted R-squared: 0.9681
F-statistic: 1.258e+05 on 13 and 53926 DF, p-value: < 2.2e-16