ARES40011 Rsrch Methods & Data Analysis
WEEK 1
Content for Week 1
WEEK 1
Week 3 Post Session: Chapter 6.6.1
Content for Week 3
Week 3 Post Session: Chapter 6.6.1
library(tidyverse)
# Summarise `poptotal` within each state of the `midwest` data.
midwest %>%
  group_by(state) %>%
  summarise(
    poptotalmean = mean(poptotal),          # mean of poptotal in the state
    poptotalmed  = median(poptotal),        # median of poptotal in the state
    popmax       = max(poptotal),           # largest poptotal in the state
    popmin       = min(poptotal),           # smallest poptotal in the state
    popdistinct  = n_distinct(poptotal),    # number of distinct poptotal values
    popfirst     = first(poptotal),         # poptotal of the state's first row
    popany       = any(poptotal < 5000),    # TRUE if any row is below 5,000
    popany2      = any(poptotal > 2000000)  # TRUE if any row is above 2,000,000
  ) %>%
  ungroup()
# Shown as top rows
# A tibble: 5 × 9
state poptotalmean poptotalmed popmax popmin popdistinct popfirst popany
<chr> <dbl> <dbl> <int> <int> <int> <int> <lgl>
1 IL 112065. 24486. 5105067 4373 101 66090 TRUE
2 IN 60263. 30362. 797159 5315 92 31095 FALSE
3 MI 111992. 37308 2111687 1701 83 10145 TRUE
4 OH 123263. 54930. 1412140 11098 88 25371 FALSE
5 WI 67941. 33528 959275 3890 72 15682 TRUE
# ℹ 1 more variable: popany2 <lgl>
# Count, per state, how many rows fall below 5,000 or above 2,000,000.
midwest %>%
  group_by(state) %>%
  summarise(
    num5k   = sum(poptotal < 5000),     # rows in the state with poptotal under 5,000
    num2mil = sum(poptotal > 2000000),  # rows in the state with poptotal over 2,000,000
    numrows = n()                       # number of rows the state's figures come from
  ) %>%
  ungroup()
# A tibble: 5 × 4
state num5k num2mil numrows
<chr> <int> <int> <int>
1 IL 1 1 102
2 IN 0 0 92
3 MI 1 1 83
4 OH 0 0 88
5 WI 2 0 72
# For each county name, count the different states it appears in,
# then show the names shared by the most states first.
midwest %>%
  group_by(county) %>%
  summarise(x = n_distinct(state)) %>%  # distinct states per county name
  arrange(desc(x)) %>%                  # descending order of that count
  ungroup()
# A tibble: 320 × 2
county x
<chr> <int>
1 CRAWFORD 5
2 JACKSON 5
3 MONROE 5
4 ADAMS 4
5 BROWN 4
6 CLARK 4
7 CLINTON 4
8 JEFFERSON 4
9 LAKE 4
10 WASHINGTON 4
# ℹ 310 more rows
# Number of rows recorded under each county name.
midwest %>%
  group_by(county) %>%
  summarise(x = n()) %>%
  ungroup()
# A tibble: 320 × 2
county x
<chr> <int>
1 ADAMS 4
2 ALCONA 1
3 ALEXANDER 1
4 ALGER 1
5 ALLEGAN 1
6 ALLEN 2
7 ALPENA 1
8 ANTRIM 1
9 ARENAC 1
10 ASHLAND 2
# ℹ 310 more rows
# n_distinct() counts the unique values of a column within each group,
# whereas n() counts the rows in each group.
# The two give the same result when every value is unique: on ungrouped data
# if no value repeats, or on grouped data if no value repeats within a group.
midwest %>%
  group_by(county) %>%                   # one group per county name
  summarise(x = n_distinct(county)) %>%  # always 1: a group holds a single county name
  ungroup()
# A tibble: 320 × 2
county x
<chr> <int>
1 ADAMS 1
2 ALCONA 1
3 ALEXANDER 1
4 ALGER 1
5 ALLEGAN 1
6 ALLEN 1
7 ALPENA 1
8 ANTRIM 1
9 ARENAC 1
10 ASHLAND 1
# ℹ 310 more rows
#replacing county with state to see if different result
midwest %>%
  group_by(county) %>%
  # Different result: x is now the number of states each county name occurs in.
  summarise(x = n_distinct(state)) %>%
  ungroup()
# A tibble: 320 × 2
county x
<chr> <int>
1 ADAMS 4
2 ALCONA 1
3 ALEXANDER 1
4 ALGER 1
5 ALLEGAN 1
6 ALLEN 2
7 ALPENA 1
8 ANTRIM 1
9 ARENAC 1
10 ASHLAND 2
# ℹ 310 more rows
# Problem D
# Per clarity grade: distinct colours, distinct prices, and row count.
diamonds %>%
  group_by(clarity) %>%
  summarise(
    a = n_distinct(color),  # distinct colour grades found at this clarity
    b = n_distinct(price),  # distinct prices found at this clarity
    c = n()                 # number of diamonds at this clarity
  ) %>%
  ungroup()
# A tibble: 8 × 4
clarity a b c
<ord> <int> <int> <int>
1 I1 7 632 741
2 SI2 7 4904 9194
3 SI1 7 5380 13065
4 VS2 7 5051 12258
5 VS1 7 3926 8171
6 VVS2 7 2409 5066
7 VVS1 7 1623 3655
8 IF 7 902 1790
# Problem E: Part 1
# Mean and standard deviation of price for every colour/cut combination.
# `.groups = "drop"` returns an ungrouped tibble directly, so the
# "`summarise()` has grouped output by 'color'" message is no longer printed
# and a trailing ungroup() is not needed.
diamonds %>%
  group_by(color, cut) %>%
  summarise(
    m = mean(price),  # mean price for the colour/cut pair
    s = sd(price),    # standard deviation of price for the pair
    .groups = "drop"
  )
# A tibble: 35 × 4
color cut m s
<ord> <ord> <dbl> <dbl>
1 D Fair 4291. 3286.
2 D Good 3405. 3175.
3 D Very Good 3470. 3524.
4 D Premium 3631. 3712.
5 D Ideal 2629. 3001.
6 E Fair 3682. 2977.
7 E Good 3424. 3331.
8 E Very Good 3215. 3408.
9 E Premium 3539. 3795.
10 E Ideal 2598. 2956.
# ℹ 25 more rows
# Part 2
# Same statistics as Part 1, grouped by cut first and colour second, so the
# rows come out ordered by cut.
# `.groups = "drop"` returns an ungrouped tibble directly, so the
# "`summarise()` has grouped output by 'cut'" message is no longer printed
# and a trailing ungroup() is not needed.
diamonds %>%
  group_by(cut, color) %>%
  summarise(
    m = mean(price),
    s = sd(price),
    .groups = "drop"
  )
# A tibble: 35 × 4
cut color m s
<ord> <ord> <dbl> <dbl>
1 Fair D 4291. 3286.
2 Fair E 3682. 2977.
3 Fair F 3827. 3223.
4 Fair G 4239. 3610.
5 Fair H 5136. 3886.
6 Fair I 4685. 3730.
7 Fair J 4976. 4050.
8 Good D 3405. 3175.
9 Good E 3424. 3331.
10 Good F 3496. 3202.
# ℹ 25 more rows
# Part 3
# Mean and standard deviation of price per cut/colour/clarity combination.
# `.groups = "drop"` returns an ungrouped tibble directly, so the
# "`summarise()` has grouped output by 'cut', 'color'" message is no longer
# printed and a trailing ungroup() is not needed.
diamonds %>%
  group_by(cut, color, clarity) %>%
  summarise(
    m = mean(price),
    s = sd(price),
    msale = m * 0.80,  # mean price with 20% off (uses the m computed above)
    .groups = "drop"
  )
# A tibble: 276 × 6
cut color clarity m s msale
<ord> <ord> <ord> <dbl> <dbl> <dbl>
1 Fair D I1 7383 5899. 5906.
2 Fair D SI2 4355. 3260. 3484.
3 Fair D SI1 4273. 3019. 3419.
4 Fair D VS2 4513. 3383. 3610.
5 Fair D VS1 2921. 2550. 2337.
6 Fair D VVS2 3607 3629. 2886.
7 Fair D VVS1 4473 5457. 3578.
8 Fair D IF 1620. 525. 1296.
9 Fair E I1 2095. 824. 1676.
10 Fair E SI2 4172. 3055. 3338.
# ℹ 266 more rows
# Problem F
# Per-cut summary; later columns reuse the ones computed before them.
diamonds %>%
  group_by(cut) %>%
  summarise(
    potato    = mean(depth),     # mean depth
    pizza     = mean(price),     # mean price
    popcorn   = median(y),       # median of y
    pineapple = potato - pizza,  # mean depth minus mean price
    papaya    = pineapple^2,     # square of that difference
    peach     = n()              # number of diamonds with this cut
  ) %>%
  ungroup()
# A tibble: 5 × 7
cut potato pizza popcorn pineapple papaya peach
<ord> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 Fair 64.0 4359. 6.1 -4295. 18444586. 1610
2 Good 62.4 3929. 5.99 -3866. 14949811. 4906
3 Very Good 61.8 3982. 5.77 -3920. 15365942. 12082
4 Premium 61.3 4584. 6.06 -4523. 20457466. 13791
5 Ideal 61.7 3458. 5.26 -3396. 11531679. 21551
# Problem G: Part 1
# Mean price per colour, then add a text label and a constant column.
diamonds %>%
  group_by(color) %>%
  summarise(m = mean(price)) %>%
  mutate(
    x1 = str_c("Diamond color ", color),
    x2 = 5
  ) %>%
  ungroup()
# A tibble: 7 × 4
color m x1 x2
<ord> <dbl> <chr> <dbl>
1 D 3170. Diamond color D 5
2 E 3077. Diamond color E 5
3 F 3725. Diamond color F 5
4 G 3999. Diamond color G 5
5 H 4487. Diamond color H 5
6 I 5092. Diamond color I 5
7 J 5324. Diamond color J 5
# Part 2
diamonds %>%
  group_by(color) %>%
  summarise(m = mean(price)) %>%
  # Ungroup here to prevent errors in data management further along;
  # no closing ungroup() is needed because the data is already ungrouped.
  ungroup() %>%
  mutate(
    x1 = str_c("Diamond color ", color),
    x2 = 5
  )
# A tibble: 7 × 4
color m x1 x2
<ord> <dbl> <chr> <dbl>
1 D 3170. Diamond color D 5
2 E 3077. Diamond color E 5
3 F 3725. Diamond color F 5
4 G 3999. Diamond color G 5
5 H 4487. Diamond color H 5
6 I 5092. Diamond color I 5
7 J 5324. Diamond color J 5
# Problem H: Part 1
# Halve every price, then average the halved price within each colour.
diamonds %>%
  group_by(color) %>%
  mutate(x1 = price * 0.5) %>%
  summarise(m = mean(x1)) %>%
  ungroup()
# A tibble: 7 × 2
color m
<ord> <dbl>
1 D 1585.
2 E 1538.
3 F 1862.
4 G 2000.
5 H 2243.
6 I 2546.
7 J 2662.
# Part 2
# Difference from Part 1: Part 1 shows the mean half price for each colour;
# here the data is ungrouped before summarising, so the result is a single
# mean half price for all diamonds.
diamonds %>%
  group_by(color) %>%
  mutate(x1 = price * 0.5) %>%
  ungroup() %>%
  summarise(m = mean(x1))
# A tibble: 1 × 1
m
<dbl>
1 1966.
Why is grouping data necessary? It allows specific variables to be used to increase the accuracy of results.
Why is ungrouping data necessary? To prevent errors in data management in future
When should you ungroup data? Every time you use the group_by() command.
If the code does not contain group_by(), do you still need ungroup() at the end? No, as it is just creating a new column of data.
Exercise 6.7
library(tidyverse)
library(tidyverse)
# Open the data in the viewer, then print its structure.
view(diamonds)
str(diamonds)
tibble [53,940 × 10] (S3: tbl_df/tbl/data.frame)
$ carat : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
$ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
$ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
$ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
$ depth : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
$ table : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
$ price : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
$ x : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
$ y : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
$ z : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Arrange by lowest price first
arrange(diamonds, price)
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
# ℹ 53,930 more rows
view(diamonds)
# Arrange by highest price first
arrange(diamonds, desc(price))
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 2.29 Premium I VS2 60.8 60 18823 8.5 8.47 5.16
2 2 Very Good G SI1 63.5 56 18818 7.9 7.97 5.04
3 1.51 Ideal G IF 61.7 55 18806 7.37 7.41 4.56
4 2.07 Ideal G SI2 62.5 55 18804 8.2 8.13 5.11
5 2 Very Good H SI1 62.8 57 18803 7.95 8 5.01
6 2.29 Premium I SI1 61.8 59 18797 8.52 8.45 5.24
7 2.04 Premium H SI1 58.1 60 18795 8.37 8.28 4.84
8 2 Premium I VS1 60.8 59 18795 8.13 8.02 4.91
9 1.71 Premium F VS2 62.3 59 18791 7.57 7.53 4.7
10 2.15 Ideal G SI2 62.6 54 18791 8.29 8.35 5.21
# ℹ 53,930 more rows
# Lowest price first; equal prices are ordered from the lowest cut level up
arrange(diamonds, price, cut)
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
2 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
8 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
9 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
# ℹ 53,930 more rows
# Highest price first; equal prices are ordered from the highest cut level down
arrange(diamonds, desc(price), desc(cut))
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 2.29 Premium I VS2 60.8 60 18823 8.5 8.47 5.16
2 2 Very Good G SI1 63.5 56 18818 7.9 7.97 5.04
3 1.51 Ideal G IF 61.7 55 18806 7.37 7.41 4.56
4 2.07 Ideal G SI2 62.5 55 18804 8.2 8.13 5.11
5 2 Very Good H SI1 62.8 57 18803 7.95 8 5.01
6 2.29 Premium I SI1 61.8 59 18797 8.52 8.45 5.24
7 2.04 Premium H SI1 58.1 60 18795 8.37 8.28 4.84
8 2 Premium I VS1 60.8 59 18795 8.13 8.02 4.91
9 2.15 Ideal G SI2 62.6 54 18791 8.29 8.35 5.21
10 1.71 Premium F VS2 62.3 59 18791 7.57 7.53 4.7
# ℹ 53,930 more rows
# Lowest price first; equal prices are ordered from the lowest clarity level up
arrange(diamonds, price, clarity)
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
# ℹ 53,930 more rows
# Add salePrice: the original price with $250 taken off
mutate(diamonds, salePrice = price - 250)
# A tibble: 53,940 × 11
carat cut color clarity depth table price x y z salePrice
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 76
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 76
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 77
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 84
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 85
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 86
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 86
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 87
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 87
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 88
# ℹ 53,930 more rows
# Drop the x, y and z columns
select(diamonds, -c(x, y, z))
# A tibble: 53,940 × 7
carat cut color clarity depth table price
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int>
1 0.23 Ideal E SI2 61.5 55 326
2 0.21 Premium E SI1 59.8 61 326
3 0.23 Good E VS1 56.9 65 327
4 0.29 Premium I VS2 62.4 58 334
5 0.31 Good J SI2 63.3 58 335
6 0.24 Very Good J VVS2 62.8 57 336
7 0.24 Very Good I VVS1 62.3 57 336
8 0.26 Very Good H SI1 61.9 55 337
9 0.22 Fair E VS2 65.1 61 337
10 0.23 Very Good H VS1 59.4 61 338
# ℹ 53,930 more rows
# Number of diamonds for each cut
diamonds %>%
  group_by(cut) %>%
  summarise(X = n()) %>%  # X = rows in each cut group
  ungroup()
# A tibble: 5 × 2
cut X
<ord> <int>
1 Fair 1610
2 Good 4906
3 Very Good 12082
4 Premium 13791
5 Ideal 21551
# New column holding the total number of diamonds.
# n() counts the rows of the data flowing through the pipe, so the total stays
# correct if earlier steps (e.g. a filter) are added, instead of always
# re-reading the full `diamonds` object.
diamonds %>%
  mutate(total_diamonds = n())
# A tibble: 53,940 × 11
carat cut color clarity depth table price x y z total_diamonds
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <int>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 53940
2 0.21 Premi… E SI1 59.8 61 326 3.89 3.84 2.31 53940
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 53940
4 0.29 Premi… I VS2 62.4 58 334 4.2 4.23 2.63 53940
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 53940
6 0.24 Very … J VVS2 62.8 57 336 3.94 3.96 2.48 53940
7 0.24 Very … I VVS1 62.3 57 336 3.95 3.98 2.47 53940
8 0.26 Very … H SI1 61.9 55 337 4.07 4.11 2.53 53940
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 53940
10 0.23 Very … H VS1 59.4 61 338 4 4.05 2.39 53940
# ℹ 53,930 more rows
Research Method
Bad Question:
Do diamonds with a better clarity cost more?
Good Question:
How significant are the diamond variables in determining a diamond's price?
Week 4
Content for Week 4
Week 4
library(tidyverse)
if (!requireNamespace("modeldata", quietly = TRUE)) {
install.packages("modeldata")
}
library(modeldata)
Attaching package: 'modeldata'
The following object is masked from 'package:palmerpenguins':
penguins
#R was having an issue with diamonds data and ggplot for this code so had to remove
library(modeldata)
# Load the crickets data set and preview its first six rows.
data("crickets")
head(crickets, n = 6)
# A tibble: 6 × 3
species temp rate
<fct> <dbl> <dbl>
1 O. exclamationis 20.8 67.9
2 O. exclamationis 20.8 65.1
3 O. exclamationis 24 77.3
4 O. exclamationis 24 78.7
5 O. exclamationis 24 79.4
6 O. exclamationis 24 80.4
library(ggplot2)
# The basics
# Scatter plot of chirp rate against temperature.
ggplot(crickets, aes(x = temp, y = rate)) +
  geom_point() +
  labs(
    x = "Temperature",
    y = "Chirp rate",
    title = "Cricket chirps",
    caption = "Source: McDonald (2009)"
  )

# The same scatter plot with points coloured by species.
ggplot(crickets, aes(x = temp, y = rate, color = species)) +
  geom_point() +
  labs(
    x = "Temperature",
    y = "Chirp rate",
    color = "Species",
    title = "Cricket chirps",
    caption = "Source: McDonald (2009)"
  ) +
  scale_color_brewer(palette = "Dark2")

# Modifying basic properties of the plot
# Fixed (non-mapped) point properties: red, semi-transparent squares.
ggplot(crickets, aes(x = temp, y = rate)) +
  geom_point(
    color = "red",
    size = 2,
    alpha = 0.3,
    shape = "square"
  ) +
  labs(
    x = "Temperature",
    y = "Chirp rate",
    title = "Cricket chirps",
    caption = "Source: McDonald (2009)"
  )
# Learn more about the options for geom_point()
# with ?geom_point
# Adding another layer
# Scatter plot with one fitted straight line through all points.
# The formula is given explicitly, so ggplot2 no longer prints the
# "`geom_smooth()` using formula = 'y ~ x'" message.
ggplot(crickets, aes(x = temp, y = rate)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(
    x = "Temperature",
    y = "Chirp rate",
    title = "Cricket chirps",
    caption = "Source: McDonald (2009)"
  )
# Scatter plot coloured by species, with a fitted straight line per species.
# The formula is given explicitly, so ggplot2 no longer prints the
# "`geom_smooth()` using formula = 'y ~ x'" message.
ggplot(crickets, aes(x = temp, y = rate, color = species)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(
    x = "Temperature",
    y = "Chirp rate",
    color = "Species",
    title = "Cricket chirps",
    caption = "Source: McDonald (2009)"
  ) +
  scale_color_brewer(palette = "Dark2")
# Other plots
# One quantitative variable: histogram of chirp rate.
ggplot(crickets, aes(x = rate)) +
  geom_histogram(bins = 15)

# The same distribution drawn as a frequency polygon.
ggplot(crickets, aes(x = rate)) +
  geom_freqpoly(bins = 15)

# Bar chart of the number of observations per species.
ggplot(crickets, aes(x = species)) +
  geom_bar(color = "black", fill = "lightblue")

# The same bar chart, filled by species and without a legend.
ggplot(crickets, aes(x = species, fill = species)) +
  geom_bar(show.legend = FALSE) +
  scale_fill_brewer(palette = "Dark2")

# Box plot of chirp rate for each species.
ggplot(crickets, aes(x = species, y = rate, color = species)) +
  geom_boxplot(show.legend = FALSE) +
  scale_color_brewer(palette = "Dark2") +
  theme_minimal()

# faceting
# not great: both species share a single histogram panel
ggplot(crickets, aes(x = rate, fill = species)) +
  geom_histogram(bins = 15) +
  scale_fill_brewer(palette = "Dark2")

# One panel per species.
ggplot(crickets, aes(x = rate, fill = species)) +
  geom_histogram(bins = 15, show.legend = FALSE) +
  facet_wrap(~species) +
  scale_fill_brewer(palette = "Dark2")

# Panels stacked in a single column.
ggplot(crickets, aes(x = rate, fill = species)) +
  geom_histogram(bins = 15, show.legend = FALSE) +
  facet_wrap(~species, ncol = 1) +
  scale_fill_brewer(palette = "Dark2") +
  theme_minimal()
What Makes a Good Hypothesis?
A good hypothesis should have a clear and concise aim, be testable, and predict an effect between variables based on theory or previous research.
Week 5 Post Session
An ANOVA would help explain the data, as it compares the petal length of three species. Because the data appear to be normally distributed and petal length is a continuous variable, linear regression could also be used.
Recreating Graphs
library(ggplot2)
data(iris)
# Box plot of petal length for each species.
# The title and y label now name the variable actually plotted
# (Petal.Length); they previously said "Sepal".
ggplot(iris, aes(x = Species, y = Petal.Length)) +
  geom_boxplot() +
  labs(
    title = "Boxplot of Petal Length by Species",
    x = "Species",
    y = "Petal.Length"
  ) +
  theme_minimal()

# Overlapping density curves of petal length, one per species.
ggplot(iris, aes(x = Petal.Length, fill = Species)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Density Histogram of Petal Length",
    x = "Petal Length (cm)",
    y = "Density"
  ) +
  theme_minimal()

# Petal length against petal width, coloured by species, with blue fitted
# straight lines.
# The formula is given explicitly, so ggplot2 no longer prints the
# "`geom_smooth()` using formula = 'y ~ x'" message.
ggplot(iris, aes(x = Petal.Length, y = Petal.Width, color = Species)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
  labs(
    title = "Scatter Plot of Petal Length vs Petal Width",
    x = "Petal Length (cm)",
    y = "Petal Width (cm)",
    color = "Species"
  ) +
  theme_minimal()
# This took a while to figure out
# Label each flower "small" or "big" by comparing its sepal length with the
# overall median, then count the labels side by side for every species.
data("iris")
iris %>%
  mutate(
    size = ifelse(Sepal.Length < median(Sepal.Length), "small", "big")
  ) %>%
  ggplot(aes(x = Species, fill = size)) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  labs(y = "count", x = "Species")