# Run this script in a fresh R session (restart R) rather than calling
# rm(list = ls()): rm() only deletes objects from the global environment and
# leaves attached packages, options() and the working directory untouched,
# so it does not actually give a clean slate.
# List the objects currently in the workspace.
ls()
## character(0)
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.0     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Keep only the rows whose colour grade is D or J (the first and last levels
# of `color`) and add `col`, an unordered two-level copy of `color` to use as
# the grouping variable.
# as.factor() returns an ordered factor unchanged, so `col` used to keep all
# seven ordered levels: summary() listed empty E-I levels and lm() fitted a
# polynomial contrast (col.L) instead of a plain J-vs-D difference.
# NOTE: the printed results further down were produced before this change;
# re-run the script to refresh them (`col` becomes <fct> with levels D, J).
recoded <- diamonds %>%
  filter(color %in% c("D", "J")) %>%
  mutate(col = factor(color, levels = c("D", "J"), ordered = FALSE))
# Peek at the first three rows of the filtered data.
head(recoded, n = 3L)
## # A tibble: 3 x 11
##   carat cut       color clarity depth table price     x     y     z col  
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl> <ord>
## 1  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75 J    
## 2  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48 J    
## 3  0.3  Good      J     SI1      64      55   339  4.25  4.28  2.73 J
# Mean price within each level of `col`.
recoded %>%
  group_by(col) %>%
  summarize(m = mean(price))
## # A tibble: 2 x 2
##   col       m
##   <ord> <dbl>
## 1 D     3170.
## 2 J     5324.
# Compact structure overview: column types, factor levels, first values.
recoded %>% str()
## tibble [9,583 x 11] (S3: tbl_df/tbl/data.frame)
##  $ carat  : num [1:9583] 0.31 0.24 0.3 0.23 0.31 0.3 0.3 0.3 0.31 0.31 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 2 3 2 5 5 2 2 3 3 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 6 3 5 2 3 3 3 3 3 ...
##  $ depth  : num [1:9583] 63.3 62.8 64 62.8 62.2 63.4 63.8 62.7 59.4 58.1 ...
##  $ table  : num [1:9583] 58 57 55 56 54 54 56 59 62 62 ...
##  $ price  : int [1:9583] 335 336 339 340 344 351 351 351 353 353 ...
##  $ x      : num [1:9583] 4.34 3.94 4.25 3.93 4.35 4.23 4.23 4.21 4.39 4.44 ...
##  $ y      : num [1:9583] 4.35 3.96 4.28 3.9 4.37 4.29 4.26 4.27 4.43 4.47 ...
##  $ z      : num [1:9583] 2.75 2.48 2.73 2.46 2.71 2.7 2.71 2.66 2.62 2.59 ...
##  $ col    : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 7 7 7 7 7 7 7 7 7 7 ...
# Per-column summaries: quantiles for numeric columns, counts for factors.
recoded %>% summary()
##      carat               cut       color       clarity         depth      
##  Min.   :0.2000   Fair     : 282   D:6775   SI1    :2833   Min.   :43.00  
##  1st Qu.:0.4000   Good     : 969   E:   0   VS2    :2428   1st Qu.:61.00  
##  Median :0.7000   Very Good:2191   F:   0   SI2    :1849   Median :61.90  
##  Mean   :0.8056   Premium  :2411   G:   0   VS1    :1247   Mean   :61.75  
##  3rd Qu.:1.0300   Ideal    :3730   H:   0   VVS2   : 684   3rd Qu.:62.60  
##  Max.   :5.0100                    I:   0   VVS1   : 326   Max.   :73.60  
##                                    J:2808   (Other): 216                  
##      table           price               x               y         
##  Min.   :51.60   Min.   :  335.0   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.:56.00   1st Qu.:  976.5   1st Qu.: 4.73   1st Qu.: 4.740  
##  Median :57.00   Median : 2310.0   Median : 5.67   Median : 5.680  
##  Mean   :57.52   Mean   : 3801.1   Mean   : 5.74   Mean   : 5.743  
##  3rd Qu.:59.00   3rd Qu.: 5084.5   3rd Qu.: 6.53   3rd Qu.: 6.530  
##  Max.   :73.00   Max.   :18710.0   Max.   :10.74   Max.   :10.540  
##                                                                    
##        z         col     
##  Min.   :0.000   D:6775  
##  1st Qu.:2.930   E:   0  
##  Median :3.500   F:   0  
##  Mean   :3.545   G:   0  
##  3rd Qu.:4.030   H:   0  
##  Max.   :6.980   I:   0  
##                  J:2808
# Density of price over all retained rows: blue outline with a nearly
# transparent blue fill on a classic (no-grid) theme.
ggplot(data = recoded, mapping = aes(x = price)) +
  geom_density(colour = "blue", fill = "blue", alpha = 0.05) +
  theme_classic()

# Two-sample t-test of price between the two `col` groups (t.test() defaults
# to the Welch unequal-variance version, two-sided).
t.test(
  price ~ col,
  data = recoded
)
## 
##  Welch Two Sample t-test
## 
## data:  price by col
## t = -23.121, df = 4197.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group D and group J is not equal to 0
## 95 percent confidence interval:
##  -2336.496 -1971.232
## sample estimates:
## mean in group D mean in group J 
##        3169.954        5323.818
# Linear regression of price on the colour group, then print the coefficient
# table and fit statistics.
recode2 <- lm(formula = price ~ col, data = recoded)
summary(object = recode2)
## 
## Call:
## lm(formula = price ~ col, data = recoded)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -4989  -2367  -1288   1271  15523 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4246.89      41.59  102.11   <2e-16 ***
## col.L        1523.01      58.82   25.89   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3706 on 9581 degrees of freedom
## Multiple R-squared:  0.0654, Adjusted R-squared:  0.0653 
## F-statistic: 670.4 on 1 and 9581 DF,  p-value: < 2.2e-16