# Show which objects are currently defined in the workspace.
# The workspace is deliberately NOT cleared with `rm(list = ls())`: when this
# script is sourced that call would delete the caller's objects, while still
# leaving attached packages and changed options in place, so it never yields
# a truly clean session. Start a fresh R session when isolation is required.
ls()
## character(0)
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.0 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Keep only the diamonds with colour grade D or J and add `col`, a genuine
# two-level grouping factor for the comparisons below.
#
# `diamonds$color` is already an ordered factor with seven levels, so the
# previous `as.factor(color)` returned it unchanged: `col` kept the five empty
# levels (E-I) and its ordered class, which makes `lm()` fit a polynomial
# contrast (`col.L`) instead of a plain J-vs-D difference in means.
# `factor(..., levels = , ordered = FALSE)` drops the unused levels and the
# ordering while keeping D as the reference level.
#
# NOTE: `##` output recorded further down was captured before this change
# (it still shows `<ord>` / `col.L`) and must be regenerated by re-running.
recoded <- diamonds %>%
  filter(color %in% c("D", "J")) %>%
  mutate(col = factor(color, levels = c("D", "J"), ordered = FALSE))
# Preview the first three rows of the filtered data.
head(recoded, n = 3)
## # A tibble: 3 x 11
## carat cut color clarity depth table price x y z col
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <ord>
## 1 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 J
## 2 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 J
## 3 0.3 Good J SI1 64 55 339 4.25 4.28 2.73 J
# Mean price within each level of `col`.
recoded %>%
  group_by(col) %>%
  summarise(m = mean(price))
## # A tibble: 2 x 2
## col m
## <ord> <dbl>
## 1 D 3170.
## 2 J 5324.
# Compact overview of every column's type and first values.
str(object = recoded)
## tibble [9,583 x 11] (S3: tbl_df/tbl/data.frame)
## $ carat : num [1:9583] 0.31 0.24 0.3 0.23 0.31 0.3 0.3 0.3 0.31 0.31 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 2 3 2 5 5 2 2 3 3 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 7 7 7 7 7 7 7 7 7 7 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 6 3 5 2 3 3 3 3 3 ...
## $ depth : num [1:9583] 63.3 62.8 64 62.8 62.2 63.4 63.8 62.7 59.4 58.1 ...
## $ table : num [1:9583] 58 57 55 56 54 54 56 59 62 62 ...
## $ price : int [1:9583] 335 336 339 340 344 351 351 351 353 353 ...
## $ x : num [1:9583] 4.34 3.94 4.25 3.93 4.35 4.23 4.23 4.21 4.39 4.44 ...
## $ y : num [1:9583] 4.35 3.96 4.28 3.9 4.37 4.29 4.26 4.27 4.43 4.47 ...
## $ z : num [1:9583] 2.75 2.48 2.73 2.46 2.71 2.7 2.71 2.66 2.62 2.59 ...
## $ col : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 7 7 7 7 7 7 7 7 7 7 ...
# Per-column summaries: quantiles for numeric columns, counts for factors.
summary(object = recoded)
## carat cut color clarity depth
## Min. :0.2000 Fair : 282 D:6775 SI1 :2833 Min. :43.00
## 1st Qu.:0.4000 Good : 969 E: 0 VS2 :2428 1st Qu.:61.00
## Median :0.7000 Very Good:2191 F: 0 SI2 :1849 Median :61.90
## Mean :0.8056 Premium :2411 G: 0 VS1 :1247 Mean :61.75
## 3rd Qu.:1.0300 Ideal :3730 H: 0 VVS2 : 684 3rd Qu.:62.60
## Max. :5.0100 I: 0 VVS1 : 326 Max. :73.60
## J:2808 (Other): 216
## table price x y
## Min. :51.60 Min. : 335.0 Min. : 0.00 Min. : 0.000
## 1st Qu.:56.00 1st Qu.: 976.5 1st Qu.: 4.73 1st Qu.: 4.740
## Median :57.00 Median : 2310.0 Median : 5.67 Median : 5.680
## Mean :57.52 Mean : 3801.1 Mean : 5.74 Mean : 5.743
## 3rd Qu.:59.00 3rd Qu.: 5084.5 3rd Qu.: 6.53 3rd Qu.: 6.530
## Max. :73.00 Max. :18710.0 Max. :10.74 Max. :10.540
##
## z col
## Min. :0.000 D:6775
## 1st Qu.:2.930 E: 0
## Median :3.500 F: 0
## Mean :3.545 G: 0
## 3rd Qu.:4.030 H: 0
## Max. :6.980 I: 0
## J:2808
# Density of `price` over all rows of `recoded` (groups are not separated),
# drawn with a blue outline and a faint blue fill on a classic theme.
recoded %>%
  ggplot(aes(x = price)) +
  geom_density(colour = "blue", fill = "blue", alpha = 0.05) +
  theme_classic()

# Two-sided Welch t-test (unequal variances) comparing mean price between the
# two groups in `col`; the defaults are spelled out for readability.
t.test(
  price ~ col,
  data = recoded,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: price by col
## t = -23.121, df = 4197.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group D and group J is not equal to 0
## 95 percent confidence interval:
## -2336.496 -1971.232
## sample estimates:
## mean in group D mean in group J
## 3169.954 5323.818
# Regress price on the colour group and print the coefficient table,
# residual summary and fit statistics.
recode2 <- lm(formula = price ~ col, data = recoded)
summary(object = recode2)
##
## Call:
## lm(formula = price ~ col, data = recoded)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4989 -2367 -1288 1271 15523
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4246.89 41.59 102.11 <2e-16 ***
## col.L 1523.01 58.82 25.89 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3706 on 9581 degrees of freedom
## Multiple R-squared: 0.0654, Adjusted R-squared: 0.0653
## F-statistic: 670.4 on 1 and 9581 DF, p-value: < 2.2e-16