ARES40011 Rsrch Methods & Data Analysis

WEEK 1

Content for Week 1

WEEK 1

Week 3 Post Session: Chapter 6.6.1

Content for Week 3

Week 3 Post Session: Chapter 6.6.1

library(tidyverse)
# Population summary for each state, computed over that state's rows of midwest.
midwest %>%
  group_by(state) %>%
  summarise(
    poptotalmean = mean(poptotal),          # mean of poptotal in the state
    poptotalmed  = median(poptotal),        # median of poptotal in the state
    popmax       = max(poptotal),           # largest poptotal in the state
    popmin       = min(poptotal),           # smallest poptotal in the state
    popdistinct  = n_distinct(poptotal),    # number of distinct poptotal values (not outliers)
    popfirst     = first(poptotal),         # poptotal of the first row in the state's group
    popany       = any(poptotal < 5000),    # TRUE if any row has poptotal below 5000
    popany2      = any(poptotal > 2000000)  # TRUE if any row has poptotal above 2000000
  ) %>%
  ungroup()  # make sure the result carries no grouping

# A tibble: 5 × 9
  state poptotalmean poptotalmed  popmax popmin popdistinct popfirst popany
  <chr>        <dbl>       <dbl>   <int>  <int>       <int>    <int> <lgl> 
1 IL         112065.      24486. 5105067   4373         101    66090 TRUE  
2 IN          60263.      30362.  797159   5315          92    31095 FALSE 
3 MI         111992.      37308  2111687   1701          83    10145 TRUE  
4 OH         123263.      54930. 1412140  11098          88    25371 FALSE 
5 WI          67941.      33528   959275   3890          72    15682 TRUE  
# ℹ 1 more variable: popany2 <lgl>

# Counts per state; sum() over a logical vector counts its TRUE values.
midwest %>%
  group_by(state) %>%
  summarise(
    num5k   = sum(poptotal < 5000),     # rows with poptotal below 5000
    num2mil = sum(poptotal > 2000000),  # rows with poptotal above 2000000
    numrows = n()                       # total number of rows in the state's group
  ) %>%
  ungroup()

# A tibble: 5 × 4
  state num5k num2mil numrows
  <chr> <int>   <int>   <int>
1 IL        1       1     102
2 IN        0       0      92
3 MI        1       1      83
4 OH        0       0      88
5 WI        2       0      72

# For every county name, count how many different states it appears in,
# then list the most widely shared names first.
midwest %>%
  group_by(county) %>%
  summarise(x = n_distinct(state)) %>%
  arrange(desc(x)) %>%  # descending order of x
  ungroup()

# A tibble: 320 × 2
   county         x
   <chr>      <int>
 1 CRAWFORD       5
 2 JACKSON        5
 3 MONROE         5
 4 ADAMS          4
 5 BROWN          4
 6 CLARK          4
 7 CLINTON        4
 8 JEFFERSON      4
 9 LAKE           4
10 WASHINGTON     4
# ℹ 310 more rows

# Number of rows recorded for each county name.
midwest %>%
  group_by(county) %>%
  summarise(x = n()) %>%
  ungroup()

# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         4
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         2
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       2
# ℹ 310 more rows

#n_distinct() counts the number of unique values of a variable within each group, whereas n() counts the number of rows in each group
#they give the same result when every value is unique: across the whole data set if the data is not grouped, or within every group if the data is grouped

# Distinct county names inside each county group: always 1, because each
# group is defined by a single county name.
midwest %>%
  group_by(county) %>%
  summarise(x = n_distinct(county)) %>%
  ungroup()

# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         1
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         1
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       1
# ℹ 310 more rows

# Replacing county with state inside n_distinct() gives a different result:
# x is now the number of distinct states in which each county name appears.
midwest %>%
  group_by(county) %>%
  summarise(x = n_distinct(state)) %>%
  ungroup()

# A tibble: 320 × 2
   county        x
   <chr>     <int>
 1 ADAMS         4
 2 ALCONA        1
 3 ALEXANDER     1
 4 ALGER         1
 5 ALLEGAN       1
 6 ALLEN         2
 7 ALPENA        1
 8 ANTRIM        1
 9 ARENAC        1
10 ASHLAND       2
# ℹ 310 more rows

#Problem D
# One summary row per clarity grade.
diamonds %>%
  group_by(clarity) %>%
  summarise(
    a = n_distinct(color),  # number of distinct colours found at this clarity
    b = n_distinct(price),  # number of distinct prices found at this clarity
    c = n()                 # number of diamonds at this clarity
  ) %>%
  ungroup()

# A tibble: 8 × 4
  clarity     a     b     c
  <ord>   <int> <int> <int>
1 I1          7   632   741
2 SI2         7  4904  9194
3 SI1         7  5380 13065
4 VS2         7  5051 12258
5 VS1         7  3926  8171
6 VVS2        7  2409  5066
7 VVS1        7  1623  3655
8 IF          7   902  1790

#Problem E: Part 1

# Mean (m) and standard deviation (s) of price for each colour/cut pair.
# color is the first grouping variable, so it leads the output.
diamonds %>%
  group_by(color, cut) %>%
  summarise(
    m = mean(price),
    s = sd(price)
  ) %>%
  ungroup()  # drop the grouping that summarise() leaves on the result

`summarise()` has grouped output by 'color'. You can override using the
`.groups` argument.

# A tibble: 35 × 4
   color cut           m     s
   <ord> <ord>     <dbl> <dbl>
 1 D     Fair      4291. 3286.
 2 D     Good      3405. 3175.
 3 D     Very Good 3470. 3524.
 4 D     Premium   3631. 3712.
 5 D     Ideal     2629. 3001.
 6 E     Fair      3682. 2977.
 7 E     Good      3424. 3331.
 8 E     Very Good 3215. 3408.
 9 E     Premium   3539. 3795.
10 E     Ideal     2598. 2956.
# ℹ 25 more rows

#Part 2

# Same statistics as Part 1 with the grouping order reversed:
# cut first, then colour.
diamonds %>%
  group_by(cut, color) %>%
  summarise(
    m = mean(price),
    s = sd(price)
  ) %>%
  ungroup()  # drop the grouping that summarise() leaves on the result

`summarise()` has grouped output by 'cut'. You can override using the `.groups`
argument.

# A tibble: 35 × 4
   cut   color     m     s
   <ord> <ord> <dbl> <dbl>
 1 Fair  D     4291. 3286.
 2 Fair  E     3682. 2977.
 3 Fair  F     3827. 3223.
 4 Fair  G     4239. 3610.
 5 Fair  H     5136. 3886.
 6 Fair  I     4685. 3730.
 7 Fair  J     4976. 4050.
 8 Good  D     3405. 3175.
 9 Good  E     3424. 3331.
10 Good  F     3496. 3202.
# ℹ 25 more rows

#Part 3

# m and s: mean and standard deviation of price per cut/colour/clarity group.
# msale: the group's mean price with 20% taken off (80% of m).
diamonds %>%
  group_by(cut, color, clarity) %>%
  summarise(
    m = mean(price),
    s = sd(price),
    msale = 0.80 * m
  ) %>%
  ungroup()

`summarise()` has grouped output by 'cut', 'color'. You can override using the
`.groups` argument.

# A tibble: 276 × 6
   cut   color clarity     m     s msale
   <ord> <ord> <ord>   <dbl> <dbl> <dbl>
 1 Fair  D     I1      7383  5899. 5906.
 2 Fair  D     SI2     4355. 3260. 3484.
 3 Fair  D     SI1     4273. 3019. 3419.
 4 Fair  D     VS2     4513. 3383. 3610.
 5 Fair  D     VS1     2921. 2550. 2337.
 6 Fair  D     VVS2    3607  3629. 2886.
 7 Fair  D     VVS1    4473  5457. 3578.
 8 Fair  D     IF      1620.  525. 1296.
 9 Fair  E     I1      2095.  824. 1676.
10 Fair  E     SI2     4172. 3055. 3338.
# ℹ 266 more rows

#Problem F

# Columns created earlier in summarise() can be reused by later ones
# in the same call (pineapple uses potato and pizza; papaya uses pineapple).
diamonds %>%
  group_by(cut) %>%
  summarise(
    potato = mean(depth),        # mean depth per cut
    pizza = mean(price),         # mean price per cut
    popcorn = median(y),         # median of y per cut
    pineapple = potato - pizza,  # mean depth minus mean price
    papaya = pineapple^2,        # square of that difference
    peach = n()                  # number of diamonds per cut
  ) %>%
  ungroup()

# A tibble: 5 × 7
  cut       potato pizza popcorn pineapple    papaya peach
  <ord>      <dbl> <dbl>   <dbl>     <dbl>     <dbl> <int>
1 Fair        64.0 4359.    6.1     -4295. 18444586.  1610
2 Good        62.4 3929.    5.99    -3866. 14949811.  4906
3 Very Good   61.8 3982.    5.77    -3920. 15365942. 12082
4 Premium     61.3 4584.    6.06    -4523. 20457466. 13791
5 Ideal       61.7 3458.    5.26    -3396. 11531679. 21551

#Problem G: Part 1

# mutate() runs on the summarised table (one row per colour),
# adding a text label and a constant column.
diamonds %>%
  group_by(color) %>%
  summarise(m = mean(price)) %>%
  mutate(
    x1 = str_c("Diamond color ", color),
    x2 = 5
  ) %>%
  ungroup()

# A tibble: 7 × 4
  color     m x1                 x2
  <ord> <dbl> <chr>           <dbl>
1 D     3170. Diamond color D     5
2 E     3077. Diamond color E     5
3 F     3725. Diamond color F     5
4 G     3999. Diamond color G     5
5 H     4487. Diamond color H     5
6 I     5092. Diamond color I     5
7 J     5324. Diamond color J     5

#Part 2

# Same result as Part 1, but ungroup() comes straight after summarise() to
# prevent errors in data management further along. No closing ungroup() is
# needed because the data is already ungrouped when mutate() runs.
diamonds %>%
  group_by(color) %>%
  summarise(m = mean(price)) %>%
  ungroup() %>%
  mutate(
    x1 = str_c("Diamond color ", color),
    x2 = 5
  )

# A tibble: 7 × 4
  color     m x1                 x2
  <ord> <dbl> <chr>           <dbl>
1 D     3170. Diamond color D     5
2 E     3077. Diamond color E     5
3 F     3725. Diamond color F     5
4 G     3999. Diamond color G     5
5 H     4487. Diamond color H     5
6 I     5092. Diamond color I     5
7 J     5324. Diamond color J     5

#Problem H: Part 1

# Half price for every diamond, then the mean half price for each colour
# (the data is still grouped by color when summarise() runs).
diamonds %>%
  group_by(color) %>%
  mutate(x1 = 0.5 * price) %>%
  summarise(m = mean(x1)) %>%
  ungroup()

# A tibble: 7 × 2
  color     m
  <ord> <dbl>
1 D     1585.
2 E     1538.
3 F     1862.
4 G     2000.
5 H     2243.
6 I     2546.
7 J     2662.

#Part 2
# Difference from Part 1: ungroup() runs before summarise(), so this gives one
# mean half price for all diamonds instead of one mean half price per colour.
diamonds %>%
  group_by(color) %>%
  mutate(x1 = 0.5 * price) %>%
  ungroup() %>%
  summarise(m = mean(x1))

# A tibble: 1 × 1
      m
  <dbl>
1 1966.

Why is grouping data necessary? It allows specific variables to be used to increase the accuracy of the results

Why is ungrouping data necessary? To prevent errors in data management in future

When should you ungroup data? Every time you use the group_by() command

If the code does not contain group_by(), do you still need ungroup() at the end? No, as it is just creating a new column of data

Exercise 6.7

library(tidyverse)
library(tidyverse)
# Open diamonds in the data viewer for interactive inspection
view(diamonds)
# Print a compact overview of the structure: column names, types and first values
str(diamonds)

tibble [53,940 × 10] (S3: tbl_df/tbl/data.frame)
 $ carat  : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
 $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
 $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
 $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
 $ depth  : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
 $ table  : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
 $ price  : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
 $ x      : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
 $ y      : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
 $ z      : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

# Sort ascending by price so the cheapest diamonds come first
arrange(diamonds, price)

# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
# ℹ 53,930 more rows

# Re-open the viewer on diamonds; the arrange() result above was not assigned, so the data set is unchanged
view(diamonds)

# Sort descending by price so the most expensive diamonds come first
arrange(diamonds, desc(price))

# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  2.29 Premium   I     VS2      60.8    60 18823  8.5   8.47  5.16
 2  2    Very Good G     SI1      63.5    56 18818  7.9   7.97  5.04
 3  1.51 Ideal     G     IF       61.7    55 18806  7.37  7.41  4.56
 4  2.07 Ideal     G     SI2      62.5    55 18804  8.2   8.13  5.11
 5  2    Very Good H     SI1      62.8    57 18803  7.95  8     5.01
 6  2.29 Premium   I     SI1      61.8    59 18797  8.52  8.45  5.24
 7  2.04 Premium   H     SI1      58.1    60 18795  8.37  8.28  4.84
 8  2    Premium   I     VS1      60.8    59 18795  8.13  8.02  4.91
 9  1.71 Premium   F     VS2      62.3    59 18791  7.57  7.53  4.7 
10  2.15 Ideal     G     SI2      62.6    54 18791  8.29  8.35  5.21
# ℹ 53,930 more rows

# Lowest price first; rows with equal prices are ordered by cut (lowest level first)
arrange(diamonds, price, cut)

# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
 2  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
 8  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
 9  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
# ℹ 53,930 more rows

# Highest price first; rows with equal prices are ordered by cut (highest level first)
arrange(diamonds, desc(price), desc(cut))

# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  2.29 Premium   I     VS2      60.8    60 18823  8.5   8.47  5.16
 2  2    Very Good G     SI1      63.5    56 18818  7.9   7.97  5.04
 3  1.51 Ideal     G     IF       61.7    55 18806  7.37  7.41  4.56
 4  2.07 Ideal     G     SI2      62.5    55 18804  8.2   8.13  5.11
 5  2    Very Good H     SI1      62.8    57 18803  7.95  8     5.01
 6  2.29 Premium   I     SI1      61.8    59 18797  8.52  8.45  5.24
 7  2.04 Premium   H     SI1      58.1    60 18795  8.37  8.28  4.84
 8  2    Premium   I     VS1      60.8    59 18795  8.13  8.02  4.91
 9  2.15 Ideal     G     SI2      62.6    54 18791  8.29  8.35  5.21
10  1.71 Premium   F     VS2      62.3    59 18791  7.57  7.53  4.7 
# ℹ 53,930 more rows

# Lowest price first; rows with equal prices are ordered by clarity (lowest level first)
arrange(diamonds, price, clarity)

# A tibble: 53,940 × 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
# ℹ 53,930 more rows

# Add salePrice: the original price with $250 taken off
mutate(diamonds, salePrice = price - 250)

# A tibble: 53,940 × 11
   carat cut       color clarity depth table price     x     y     z salePrice
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>     <dbl>
 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43        76
 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31        76
 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31        77
 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63        84
 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75        85
 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48        86
 7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47        86
 8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53        87
 9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49        87
10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39        88
# ℹ 53,930 more rows

# Drop the x, y and z columns, keeping every other variable
diamonds %>%
  select(-c(x, y, z))

# A tibble: 53,940 × 7
   carat cut       color clarity depth table price
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int>
 1  0.23 Ideal     E     SI2      61.5    55   326
 2  0.21 Premium   E     SI1      59.8    61   326
 3  0.23 Good      E     VS1      56.9    65   327
 4  0.29 Premium   I     VS2      62.4    58   334
 5  0.31 Good      J     SI2      63.3    58   335
 6  0.24 Very Good J     VVS2     62.8    57   336
 7  0.24 Very Good I     VVS1     62.3    57   336
 8  0.26 Very Good H     SI1      61.9    55   337
 9  0.22 Fair      E     VS2      65.1    61   337
10  0.23 Very Good H     VS1      59.4    61   338
# ℹ 53,930 more rows

# Number of diamonds for each cut; count() groups, tallies the rows and
# returns an ungrouped table with the tally stored in column X
diamonds %>%
  count(cut, name = "X")

# A tibble: 5 × 2
  cut           X
  <ord>     <int>
1 Fair       1610
2 Good       4906
3 Very Good 12082
4 Premium   13791
5 Ideal     21551

# New column holding the total number of diamonds.
# n() counts the rows of the data flowing through the pipeline, so the column
# stays correct if the input is filtered or swapped; nrow(diamonds) would
# always report the size of the full global data set.
diamonds %>%
  mutate(total_diamonds = n())

# A tibble: 53,940 × 11
   carat cut    color clarity depth table price     x     y     z total_diamonds
   <dbl> <ord>  <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>          <int>
 1  0.23 Ideal  E     SI2      61.5    55   326  3.95  3.98  2.43          53940
 2  0.21 Premi… E     SI1      59.8    61   326  3.89  3.84  2.31          53940
 3  0.23 Good   E     VS1      56.9    65   327  4.05  4.07  2.31          53940
 4  0.29 Premi… I     VS2      62.4    58   334  4.2   4.23  2.63          53940
 5  0.31 Good   J     SI2      63.3    58   335  4.34  4.35  2.75          53940
 6  0.24 Very … J     VVS2     62.8    57   336  3.94  3.96  2.48          53940
 7  0.24 Very … I     VVS1     62.3    57   336  3.95  3.98  2.47          53940
 8  0.26 Very … H     SI1      61.9    55   337  4.07  4.11  2.53          53940
 9  0.22 Fair   E     VS2      65.1    61   337  3.87  3.78  2.49          53940
10  0.23 Very … H     VS1      59.4    61   338  4     4.05  2.39          53940
# ℹ 53,930 more rows

Research Method

Bad Question:

Do diamonds with a better clarity cost more?

Good Question:

How significant are the diamond variables in determining the diamond's price?

Week 4

Content for Week 4

Week 4

library(tidyverse)
if (!requireNamespace("modeldata", quietly = TRUE)) {
  install.packages("modeldata")
}
library(modeldata)


Attaching package: 'modeldata'

The following object is masked from 'package:palmerpenguins':

    penguins

# R was having an issue with the diamonds data and ggplot for this code, so it had to be removed
# Attach modeldata, load its crickets data set and preview the first six rows
library(modeldata)
data("crickets")
head(crickets, n = 6)

# A tibble: 6 × 3
  species           temp  rate
  <fct>            <dbl> <dbl>
1 O. exclamationis  20.8  67.9
2 O. exclamationis  20.8  65.1
3 O. exclamationis  24    77.3
4 O. exclamationis  24    78.7
5 O. exclamationis  24    79.4
6 O. exclamationis  24    80.4

library(ggplot2)


# The basics

# Scatter plot of chirp rate against temperature
ggplot(crickets, aes(temp, rate)) +
  geom_point() +
  labs(
    x = "Temperature",
    y = "Chirp rate",
    title = "Cricket chirps",
    caption = "Source: McDonald (2009)"
  )

# Same scatter plot with points coloured by species (Dark2 palette)
ggplot(crickets, aes(temp, rate, color = species)) +
  geom_point() +
  labs(
    x = "Temperature",
    y = "Chirp rate",
    color = "Species",
    title = "Cricket chirps",
    caption = "Source: McDonald (2009)"
  ) +
  scale_color_brewer(palette = "Dark2")

# Modifying basic properties of the plot:
# values given outside aes() are fixed settings applied to every point
ggplot(crickets, aes(temp, rate)) +
  geom_point(
    color = "red",
    size = 2,
    alpha = 0.3,
    shape = "square"
  ) +
  labs(
    x = "Temperature",
    y = "Chirp rate",
    title = "Cricket chirps",
    caption = "Source: McDonald (2009)"
  )

# Learn more about the options for geom_point()
# with ?geom_point

# Adding another layer


# Scatter plot with a second layer: a linear-model fit drawn without
# its standard-error band
ggplot(crickets, aes(temp, rate)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Temperature",
    y = "Chirp rate",
    title = "Cricket chirps",
    caption = "Source: McDonald (2009)"
  )

`geom_smooth()` using formula = 'y ~ x'

# Colour is mapped to species in the plot-level aes(), so both the points
# and the linear fit layer inherit it
ggplot(crickets, aes(temp, rate, color = species)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Temperature",
    y = "Chirp rate",
    color = "Species",
    title = "Cricket chirps",
    caption = "Source: McDonald (2009)"
  ) +
  scale_color_brewer(palette = "Dark2")

`geom_smooth()` using formula = 'y ~ x'

# Other plots

# Histogram of chirp rate (one quantitative variable), 15 bins
ggplot(crickets, aes(rate)) +
  geom_histogram(bins = 15)

# Frequency polygon of the same variable with the same number of bins
ggplot(crickets, aes(rate)) +
  geom_freqpoly(bins = 15)

# Bar chart of observations per species with fixed outline and fill colours
ggplot(crickets, aes(species)) +
  geom_bar(color = "black", fill = "lightblue")

# Bar chart filled by species; the legend is hidden in the bar layer
ggplot(crickets, aes(species, fill = species)) +
  geom_bar(show.legend = FALSE) +
  scale_fill_brewer(palette = "Dark2")

# Boxplot of chirp rate for each species, outlined in the species colour
ggplot(crickets, aes(species, rate, color = species)) +
  geom_boxplot(show.legend = FALSE) +
  scale_color_brewer(palette = "Dark2") +
  theme_minimal()

# faceting

# not great: every species is drawn in the same panel
ggplot(crickets, aes(rate, fill = species)) +
  geom_histogram(bins = 15) +
  scale_fill_brewer(palette = "Dark2")

# One histogram panel per species; the legend is dropped in the histogram layer
ggplot(crickets, aes(rate, fill = species)) +
  geom_histogram(bins = 15, show.legend = FALSE) +
  facet_wrap(~ species) +
  scale_fill_brewer(palette = "Dark2")

# Panels laid out in a single column (ncol = 1) so the species sit one
# above the other
ggplot(crickets, aes(rate, fill = species)) +
  geom_histogram(bins = 15, show.legend = FALSE) +
  facet_wrap(~ species, ncol = 1) +
  scale_fill_brewer(palette = "Dark2") +
  theme_minimal()

What Makes a Good Hypothesis?

A good hypothesis should have a clear and concise aim that is testable, predicting an effect between variables on the basis of theories or previous research.

Week 5 Post Session

An ANOVA would help explain the data, as it compares the petal length of three species. As the data appear to be normally distributed, linear regression could also be used, because petal length is a continuous variable.

Recreating Graphs

library(ggplot2)

data(iris)

# Boxplot of petal length for each species.
# The title and y-axis label name petal length, the variable actually mapped
# to y (they previously said sepal length).
ggplot(iris, aes(x = Species, y = Petal.Length)) +
  geom_boxplot() +
  labs(title = "Boxplot of Petal Length by Species",
       x = "Species",
       y = "Petal Length (cm)") +
  theme_minimal()

# Overlaid density curves of petal length, one per species;
# alpha = 0.5 makes the fills semi-transparent
ggplot(iris, aes(Petal.Length, fill = Species)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Density Histogram of Petal Length",
    x = "Petal Length (cm)",
    y = "Density"
  ) +
  theme_minimal()

# Petal width against petal length, points coloured by species.
# The linear fit layer is given a fixed blue colour and no standard-error band.
ggplot(iris, aes(Petal.Length, Petal.Width, color = Species)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(
    title = "Scatter Plot of Petal Length vs Petal Width",
    x = "Petal Length (cm)",
    y = "Petal Width (cm)",
    color = "Species"
  ) +
  theme_minimal()

`geom_smooth()` using formula = 'y ~ x'

# This took a while to figure out:
# label each flower "big" or "small" relative to the median sepal length,
# then draw side-by-side bars of the counts for each species
data("iris")
iris %>%
  mutate(size = ifelse(Sepal.Length >= median(Sepal.Length), "big", "small")) %>%
  ggplot(aes(Species, fill = size)) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  labs(x = "Species", y = "count")