Midterm1Part2and3

library(tidyverse)

## -- Attaching packages -------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ----------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

#Midterm Part 2: Tidyverse Data Wrangling Drills
  #view(diamonds)
 
  #Question 1
    # Keep only the depth and price columns as a plain data frame.
    # Columns are picked by name rather than by position (5 and 7), so the
    # selection does not depend on the column order of `diamonds`.
    diamonds_depth_price <- data.frame(diamonds[, c("depth", "price")])
    head(diamonds_depth_price)

##   depth price
## 1  61.5   326
## 2  59.8   326
## 3  56.9   327
## 4  62.4   334
## 5  63.3   335
## 6  62.8   336

  #Question 2
    # Add price per carat as a new column alongside the existing ones.
    pricePercarat <- diamonds %>%
      mutate(pricepercarat = price / carat)

    # Select by name instead of by position (5, 7, 11): position 11 is only
    # `pricepercarat` while `diamonds` has exactly ten columns, so a positional
    # pick silently returns a different column if `diamonds` ever gains one.
    diamonds_depth_price <- data.frame(
      pricePercarat[, c("depth", "price", "pricepercarat")]
    )

    head(diamonds_depth_price)

##   depth price pricepercarat
## 1  61.5   326      1417.391
## 2  59.8   326      1552.381
## 3  56.9   327      1421.739
## 4  62.4   334      1151.724
## 5  63.3   335      1080.645
## 6  62.8   336      1400.000

  #Question 3 and 4
     # Mean price within each cut, converted to a plain data frame at the end
     # of the pipeline.
     diamondsbyCut <- diamonds %>%
       group_by(cut) %>%
       summarise(meanprice = mean(price)) %>%
       data.frame()

## `summarise()` ungrouping output (override with `.groups` argument)

     head(diamondsbyCut)

##         cut meanprice
## 1      Fair  4358.758
## 2      Good  3928.864
## 3 Very Good  3981.760
## 4   Premium  4584.258
## 5     Ideal  3457.542

  #Question 5
    # Per-color summary: row count plus mean depth and mean table
    # (missing values are dropped from both means).
    diamondsbyColor <- diamonds %>%
      group_by(color) %>%
      summarise(
        n = n(),
        meandepth = mean(depth, na.rm = TRUE),
        meantable = mean(table, na.rm = TRUE)
      )

## `summarise()` ungrouping output (override with `.groups` argument)

    head(diamondsbyColor)

## # A tibble: 6 x 4
##   color     n meandepth meantable
##   <ord> <int>     <dbl>     <dbl>
## 1 D      6775      61.7      57.4
## 2 E      9797      61.7      57.5
## 3 F      9542      61.7      57.4
## 4 G     11292      61.8      57.3
## 5 H      8304      61.8      57.5
## 6 I      5422      61.8      57.6

  #Extra Credit Question
    
    # Attach the per-color summary (n, meandepth, meantable) to every diamond.
    # The result gets its own name so the packaged `diamonds` data set is not
    # overwritten; that keeps this step safe to re-run and leaves the original
    # ten columns untouched for the remaining questions. The join key is
    # spelled out instead of being inferred from the shared column names.
    diamonds_with_color_stats <- left_join(
      diamonds,
      diamondsbyColor,
      by = "color"
    )

    head(diamonds_with_color_stats)

## # A tibble: 6 x 13
##   carat cut   color clarity depth table price     x     y     z     n meandepth
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl> <int>     <dbl>
## 1 0.23  Ideal E     SI2      61.5    55   326  3.95  3.98  2.43  9797      61.7
## 2 0.21  Prem~ E     SI1      59.8    61   326  3.89  3.84  2.31  9797      61.7
## 3 0.23  Good  E     VS1      56.9    65   327  4.05  4.07  2.31  9797      61.7
## 4 0.290 Prem~ I     VS2      62.4    58   334  4.2   4.23  2.63  5422      61.8
## 5 0.31  Good  J     SI2      63.3    58   335  4.34  4.35  2.75  2808      61.9
## 6 0.24  Very~ J     VVS2     62.8    57   336  3.94  3.96  2.48  2808      61.9
## # ... with 1 more variable: meantable <dbl>

  #Question 6 - Color J has the biggest diamonds on average (highest mean carat, about 1.16).
   # Number of diamonds and mean carat for each color.
   diamonds %>%
     group_by(color) %>%
     summarise(
       n = n(),
       meancarat = mean(carat, na.rm = TRUE)
     )

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 7 x 3
##   color     n meancarat
##   <ord> <int>     <dbl>
## 1 D      6775     0.658
## 2 E      9797     0.658
## 3 F      9542     0.737
## 4 G     11292     0.771
## 5 H      8304     0.912
## 6 I      5422     1.03 
## 7 J      2808     1.16

  #Question 7 - Color G seems to be the most popular Ideal cut color
     # Count the Ideal-cut diamonds within each color.
     diamonds %>%
       filter(cut == "Ideal") %>%
       group_by(color) %>%
       summarise(n = n())

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 7 x 2
##   color     n
##   <ord> <int>
## 1 D      2834
## 2 E      3903
## 3 F      3826
## 4 G      4884
## 5 H      3115
## 6 I      2093
## 7 J       896

  #Question 8 - Clarity VVS1 has a mean table-per-carat ratio of about 141.
     # Per-clarity count and mean table-to-carat ratio. The ratio is computed
     # directly inside summarise() rather than in a separate mutate() step.
     diamonds %>%
       group_by(clarity) %>%
       summarise(
         n = n(),
         meantablepercarats = mean(table / carat)
       )

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 8 x 3
##   clarity     n meantablepercarats
##   <ord>   <int>              <dbl>
## 1 I1        741               56.3
## 2 SI2      9194               69.1
## 3 SI1     13065               89.6
## 4 VS2     12258              103. 
## 5 VS1      8171              107. 
## 6 VVS2     5066              127. 
## 7 VVS1     3655              141. 
## 8 IF       1790              140.

  #Question 9 - The mean price per carat of diamonds over $10,000 is $8,044 per carat.
     # Among diamonds priced above $10,000: how many there are and their mean
     # price per carat, computed directly inside summarise().
     diamonds %>%
       filter(price > 10000) %>%
       summarise(
         n = n(),
         meanpricepercarat = mean(price / carat)
       )

## # A tibble: 1 x 2
##       n meanpricepercarat
##   <int>             <dbl>
## 1  5222             8044.

  #Question 10 - Clarity SI2 seems to be the most common clarity out of diamonds that cost more than $10000.
      # Tally diamonds priced above $10,000 by clarity.
      diamonds %>%
        filter(price > 10000) %>%
        group_by(clarity) %>%
        summarise(n = n())

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 8 x 2
##   clarity     n
##   <ord>   <int>
## 1 I1         30
## 2 SI2      1239
## 3 SI1      1184
## 4 VS2      1155
## 5 VS1       747
## 6 VVS2      452
## 7 VVS1      247
## 8 IF        168

#Midterm Part 3: Data Viz

  #view(ToothGrowth)
  #?ToothGrowth
  #str(ToothGrowth)
  
  #Question 1 - Rows in a data set represent observations for each participant. In this data set, each row holds the condition levels (supp and dose) and the DV measure (len) for one guinea pig.
      
  #Question 2 - Columns in data represent variables. In this data set, there are 3 variables: length (numeric, continuous), supplement (categorical/factor), and dose (numeric, discrete, but I could also treat it as categorical/ordinal depending on my question).
      
  #Question 3 - The response is the length of the tooth (growth), while the IVs are dose and supp.
      
  #Question 4 - H0: That all groups will be equal in length (Dose 1 = Dose 2 and supp 1 = supp2)
      
  #Question 5  
      # Box plot of tooth length for each supplement type.
      ggplot(data = ToothGrowth, mapping = aes(x = supp, y = len)) +
        geom_boxplot()

  #Question 6   
      # Same box plot of tooth length by supplement, drawn as one panel per dose.
      ggplot(data = ToothGrowth, mapping = aes(x = supp, y = len)) +
        geom_boxplot() +
        facet_wrap(~ dose)

  #Question 7 - It seems like as dose increases, tooth growth also increases. Additionally, OJ seems to work better than VC, but only up to a dose of 2.0. This gives support to vitamin C aiding in tooth growth, with OJ being the preferred method up to a point.
      
  #Question 8 - The results do not fit my hypothesis because there were differences in tooth growth depending on the conditions. I would like to know more about what the different spreads mean in terms of this study.

Midterm1Part2and3

Maya_Hansen-Tilkens

9/24/2020