Helpful Hints:
{r} and. Try replacing your "output" code with the output code below (remove all hashtags)
More themes can be found here
# output:
# html_document:
# theme: journal
# toc: yes
# toc_float:
# collapsed: true
# Install the tidyverse if you don't have it yet
# (Uncomment the line below if needed)
#install.packages("tidyverse")
# Load tidyverse which includes dplyr, ggplot2, readr, etc.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Open the ggplot2 help topic, then print the reference to cite the tidyverse
help(topic = "ggplot2")
citation(package = "tidyverse")
## To cite package 'tidyverse' in publications use:
##
## Wickham H, Averick M, Bryan J, Chang W, McGowan LD, François R,
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller
## E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V,
## Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to
## the tidyverse." _Journal of Open Source Software_, *4*(43), 1686.
## doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
##
## A BibTeX entry for LaTeX users is
##
## @Article{,
## title = {Welcome to the {tidyverse}},
## author = {Hadley Wickham and Mara Averick and Jennifer Bryan and Winston Chang and Lucy D'Agostino McGowan and Romain François and Garrett Grolemund and Alex Hayes and Lionel Henry and Jim Hester and Max Kuhn and Thomas Lin Pedersen and Evan Miller and Stephan Milton Bache and Kirill Müller and Jeroen Ooms and David Robinson and Dana Paige Seidel and Vitalie Spinu and Kohske Takahashi and Davis Vaughan and Claus Wilke and Kara Woo and Hiroaki Yutani},
## year = {2019},
## journal = {Journal of Open Source Software},
## volume = {4},
## number = {43},
## pages = {1686},
## doi = {10.21105/joss.01686},
## }
# Read the seaborn "tips" example data straight from GitHub into a tibble
url <-
  "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv"
tips <- read_csv(file = url)
## Rows: 244 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): sex, smoker, day, time
## dbl (3): total_bill, tip, size
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# data=read.csv("testdata.csv")
# Peek at the data: print the first six rows
head(tips, n = 6)
## # A tibble: 6 × 7
## total_bill tip sex smoker day time size
## <dbl> <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 17.0 1.01 Female No Sun Dinner 2
## 2 10.3 1.66 Male No Sun Dinner 3
## 3 21.0 3.5 Male No Sun Dinner 3
## 4 23.7 3.31 Male No Sun Dinner 2
## 5 24.6 3.61 Female No Sun Dinner 4
## 6 25.3 4.71 Male No Sun Dinner 4
# Print the last six rows
tail(tips, n = 6)
## # A tibble: 6 × 7
## total_bill tip sex smoker day time size
## <dbl> <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 35.8 4.67 Female No Sat Dinner 3
## 2 29.0 5.92 Male No Sat Dinner 3
## 3 27.2 2 Female Yes Sat Dinner 2
## 4 22.7 2 Male Yes Sat Dinner 2
## 5 17.8 1.75 Male No Sat Dinner 2
## 6 18.8 3 Female No Thur Dinner 2
# Compact structure dump: each column's name, type, and first values
str(object = tips)
## spc_tbl_ [244 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ total_bill: num [1:244] 17 10.3 21 23.7 24.6 ...
## $ tip : num [1:244] 1.01 1.66 3.5 3.31 3.61 4.71 2 3.12 1.96 3.23 ...
## $ sex : chr [1:244] "Female" "Male" "Male" "Male" ...
## $ smoker : chr [1:244] "No" "No" "No" "No" ...
## $ day : chr [1:244] "Sun" "Sun" "Sun" "Sun" ...
## $ time : chr [1:244] "Dinner" "Dinner" "Dinner" "Dinner" ...
## $ size : num [1:244] 2 3 3 2 4 4 2 4 2 2 ...
## - attr(*, "spec")=
## .. cols(
## .. total_bill = col_double(),
## .. tip = col_double(),
## .. sex = col_character(),
## .. smoker = col_character(),
## .. day = col_character(),
## .. time = col_character(),
## .. size = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Use `$` to isolate a specific variable. At this point the column is still
# called `total_bill` (it is only renamed to `bill_total` further down);
# asking a tibble for a column that does not exist warns and returns NULL.
head(tips$total_bill)
## [1] 16.99 10.34 21.01 23.68 24.59 25.29
# Per-column summary (base R): quartiles and mean for numeric columns,
# length/class/mode for character columns
summary(object = tips)
## total_bill tip sex smoker
## Min. : 3.07 Min. : 1.000 Length:244 Length:244
## 1st Qu.:13.35 1st Qu.: 2.000 Class :character Class :character
## Median :17.80 Median : 2.900 Mode :character Mode :character
## Mean :19.79 Mean : 2.998
## 3rd Qu.:24.13 3rd Qu.: 3.562
## Max. :50.81 Max. :10.000
## day time size
## Length:244 Length:244 Min. :1.00
## Class :character Class :character 1st Qu.:2.00
## Mode :character Mode :character Median :2.00
## Mean :2.57
## 3rd Qu.:3.00
## Max. :6.00
# Number of rows and number of columns
c(nrow(tips), ncol(tips))
## [1] 244 7
# Column-wise preview: one line per column with its type and first values
glimpse(x = tips)
## Rows: 244
## Columns: 7
## $ total_bill <dbl> 16.99, 10.34, 21.01, 23.68, 24.59, 25.29, 8.77, 26.88, 15.0…
## $ tip <dbl> 1.01, 1.66, 3.50, 3.31, 3.61, 4.71, 2.00, 3.12, 1.96, 3.23,…
## $ sex <chr> "Female", "Male", "Male", "Male", "Female", "Male", "Male",…
## $ smoker <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",…
## $ day <chr> "Sun", "Sun", "Sun", "Sun", "Sun", "Sun", "Sun", "Sun", "Su…
## $ time <chr> "Dinner", "Dinner", "Dinner", "Dinner", "Dinner", "Dinner",…
## $ size <dbl> 2, 3, 3, 2, 4, 4, 2, 4, 2, 2, 2, 4, 2, 4, 2, 2, 3, 3, 3, 3,…
# Open the help page for cor.test
help("cor.test")
# List every column name
colnames(tips)
## [1] "total_bill" "tip" "sex" "smoker" "day"
## [6] "time" "size"
# View each column's data type.
# vapply() guarantees exactly one character string per column, whereas
# sapply() silently returns a list as soon as any column has several classes.
# Multi-class columns are reported as "class1/class2".
vapply(tips, function(col) paste(class(col), collapse = "/"), character(1))
## total_bill tip sex smoker day time
## "numeric" "numeric" "character" "character" "character" "character"
## size
## "numeric"
# glimpse() is the dplyr counterpart of str()
glimpse(x = tips)
## Rows: 244
## Columns: 7
## $ total_bill <dbl> 16.99, 10.34, 21.01, 23.68, 24.59, 25.29, 8.77, 26.88, 15.0…
## $ tip <dbl> 1.01, 1.66, 3.50, 3.31, 3.61, 4.71, 2.00, 3.12, 1.96, 3.23,…
## $ sex <chr> "Female", "Male", "Male", "Male", "Female", "Male", "Male",…
## $ smoker <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",…
## $ day <chr> "Sun", "Sun", "Sun", "Sun", "Sun", "Sun", "Sun", "Sun", "Su…
## $ time <chr> "Dinner", "Dinner", "Dinner", "Dinner", "Dinner", "Dinner",…
## $ size <dbl> 2, 3, 3, 2, 4, 4, 2, 4, 2, 2, 2, 4, 2, 4, 2, 2, 3, 3, 3, 3,…
# Store a factor copy of `sex` next to the original character column
tips[["sex_Cat"]] <- as.factor(tips[["sex"]])
# List the values the factor can take
levels(tips[["sex_Cat"]])
## [1] "Female" "Male"
# Keep only the `sex` and `total_bill` columns
tips %>%
  select(sex, total_bill)
## # A tibble: 244 × 2
## sex total_bill
## <chr> <dbl>
## 1 Female 17.0
## 2 Male 10.3
## 3 Male 21.0
## 4 Male 23.7
## 5 Female 24.6
## 6 Male 25.3
## 7 Male 8.77
## 8 Male 26.9
## 9 Male 15.0
## 10 Male 14.8
## # ℹ 234 more rows
# Keep only the rows whose total bill is above 20
tips %>%
  filter(total_bill > 20)
## # A tibble: 97 × 8
## total_bill tip sex smoker day time size sex_Cat
## <dbl> <dbl> <chr> <chr> <chr> <chr> <dbl> <fct>
## 1 21.0 3.5 Male No Sun Dinner 3 Male
## 2 23.7 3.31 Male No Sun Dinner 2 Male
## 3 24.6 3.61 Female No Sun Dinner 4 Female
## 4 25.3 4.71 Male No Sun Dinner 4 Male
## 5 26.9 3.12 Male No Sun Dinner 4 Male
## 6 35.3 5 Female No Sun Dinner 4 Female
## 7 21.6 3.92 Male No Sun Dinner 2 Male
## 8 20.6 3.35 Male No Sat Dinner 3 Male
## 9 20.3 2.75 Female No Sat Dinner 2 Female
## 10 39.4 7.58 Male No Sat Dinner 4 Male
## # ℹ 87 more rows
# Same two steps as above, combined: first keep `sex` and `total_bill`,
# then keep the rows with a bill above 20 (written as nested calls)
tips_filtered <- filter(
  select(tips, sex, total_bill),
  total_bill > 20
)
# Columns can also be picked by position: here the first three
tips %>%
  select(1:3)
## # A tibble: 244 × 3
## total_bill tip sex
## <dbl> <dbl> <chr>
## 1 17.0 1.01 Female
## 2 10.3 1.66 Male
## 3 21.0 3.5 Male
## 4 23.7 3.31 Male
## 5 24.6 3.61 Female
## 6 25.3 4.71 Male
## 7 8.77 2 Male
## 8 26.9 3.12 Male
## 9 15.0 1.96 Male
## 10 14.8 3.23 Male
## # ℹ 234 more rows
# Keep every column except `tip`
tips %>%
  select(!tip)
## # A tibble: 244 × 7
## total_bill sex smoker day time size sex_Cat
## <dbl> <chr> <chr> <chr> <chr> <dbl> <fct>
## 1 17.0 Female No Sun Dinner 2 Female
## 2 10.3 Male No Sun Dinner 3 Male
## 3 21.0 Male No Sun Dinner 3 Male
## 4 23.7 Male No Sun Dinner 2 Male
## 5 24.6 Female No Sun Dinner 4 Female
## 6 25.3 Male No Sun Dinner 4 Male
## 7 8.77 Male No Sun Dinner 2 Male
## 8 26.9 Male No Sun Dinner 4 Male
## 9 15.0 Male No Sun Dinner 2 Male
## 10 14.8 Male No Sun Dinner 2 Male
## # ℹ 234 more rows
# One row per sex: number of rows, mean tip, and largest tip
summarise(
  group_by(tips, sex),
  count = n(),
  avg_tip = mean(tip),
  max_tip = max(tip)
)
## # A tibble: 2 × 4
## sex count avg_tip max_tip
## <chr> <int> <dbl> <dbl>
## 1 Female 87 2.83 6.5
## 2 Male 157 3.09 10
# One row per sex/smoker combination with the mean total bill;
# .groups = "drop" returns an ungrouped tibble
summarise(
  group_by(tips, sex, smoker),
  mean_total = mean(total_bill),
  .groups = "drop"
)
## # A tibble: 4 × 3
## sex smoker mean_total
## <chr> <chr> <dbl>
## 1 Female No 18.1
## 2 Female Yes 18.0
## 3 Male No 19.8
## 4 Male Yes 22.3
# Count the NA entries in each column
tips %>%
  is.na() %>%
  colSums()
## total_bill tip sex smoker day time size
## 0 0 0 0 0 0 0
## sex_Cat
## 0
# Copy of the data without any row that contains an NA
tips_clean <- tips %>% na.omit()
# In the original data, overwrite missing tips with 0
tips$tip <- replace(tips$tip, is.na(tips$tip), 0)
# Rename with new_name = old_name:
# total_bill becomes bill_total, tip becomes waiter_tip
tips <- rename(tips, bill_total = total_bill, waiter_tip = tip)
# Add tip_percent: the tip as a percentage of the bill, rounded to 1 decimal
tips <- mutate(
  tips,
  tip_percent = round((waiter_tip / bill_total) * 100, 1)
)
# Show the ten rows with the highest tip percentage
head(arrange(tips, desc(tip_percent)), 10)
## # A tibble: 10 × 9
## bill_total waiter_tip sex smoker day time size sex_Cat tip_percent
## <dbl> <dbl> <chr> <chr> <chr> <chr> <dbl> <fct> <dbl>
## 1 7.25 5.15 Male Yes Sun Dinner 2 Male 71
## 2 9.6 4 Female Yes Sun Dinner 2 Female 41.7
## 3 3.07 1 Female Yes Sat Dinner 1 Female 32.6
## 4 11.6 3.39 Male No Sat Dinner 2 Male 29.2
## 5 23.2 6.5 Male Yes Sun Dinner 4 Male 28.1
## 6 14.3 4 Female Yes Sat Dinner 2 Female 28
## 7 7.51 2 Male No Thur Lunch 2 Male 26.6
## 8 16.3 4.3 Female Yes Fri Dinner 2 Female 26.3
## 9 13.4 3.48 Female Yes Fri Lunch 2 Female 25.9
## 10 10.3 2.6 Female No Sun Dinner 2 Female 25.3
# Base R version: mean of waiter_tip for each sex
aggregate(
  waiter_tip ~ sex,
  data = tips,
  FUN = mean
)
## sex waiter_tip
## 1 Female 2.833448
## 2 Male 3.089618
# Base R version: mean and max of waiter_tip for each sex/smoker combination.
# The summary function returns a named vector, giving one output column
# per statistic.
aggregate(
  waiter_tip ~ sex + smoker,
  data = tips,
  FUN = function(tip_values) {
    c(mean = mean(tip_values), max = max(tip_values))
  }
)
## sex smoker waiter_tip.mean waiter_tip.max
## 1 Female No 2.773519 5.200000
## 2 Male No 3.113402 9.000000
## 3 Female Yes 2.931515 6.500000
## 4 Male Yes 3.051167 10.000000
# Load dplyr if not already
library(dplyr)
# dplyr version: mean of waiter_tip for each sex, ignoring NAs
summarise(
  group_by(tips, sex),
  mean_tip = mean(waiter_tip, na.rm = TRUE)
)
## # A tibble: 2 × 2
## sex mean_tip
## <chr> <dbl>
## 1 Female 2.83
## 2 Male 3.09
# dplyr version: mean and max of waiter_tip for each sex/smoker combination,
# ignoring NAs; .groups = "drop" returns an ungrouped tibble
summarise(
  group_by(tips, sex, smoker),
  mean_tip = mean(waiter_tip, na.rm = TRUE),
  max_tip = max(waiter_tip, na.rm = TRUE),
  .groups = "drop"
)
## # A tibble: 4 × 4
## sex smoker mean_tip max_tip
## <chr> <chr> <dbl> <dbl>
## 1 Female No 2.77 5.2
## 2 Female Yes 2.93 6.5
## 3 Male No 3.11 9
## 4 Male Yes 3.05 10
# Base R cross-tabulation: rows are sex, columns are smoker status
table(tips[["sex"]], tips[["smoker"]])
##
## No Yes
## Female 54 33
## Male 97 60
# dplyr equivalent: one row per sex/smoker combination with its count `n`
count(tips, sex, smoker)
## # A tibble: 4 × 3
## sex smoker n
## <chr> <chr> <int>
## 1 Female No 54
## 2 Female Yes 33
## 3 Male No 97
## 4 Male Yes 60
# Bring ggplot2's built-in mpg dataset into the workspace
data("mpg")
# Column-wise preview of the dataset
glimpse(x = mpg)
## Rows: 234
## Columns: 11
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
## $ displ <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.…
## $ year <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 200…
## $ cyl <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, …
## $ trans <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto…
## $ drv <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "4…
## $ cty <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, 1…
## $ hwy <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, 2…
## $ fl <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
## $ class <chr> "compact", "compact", "compact", "compact", "compact", "c…
# Assignment steps, one transformation per statement:
# 1. keep the four columns of interest
mpg_new <- select(mpg, manufacturer, model, cty, hwy)
# 2. add the average of city and highway mileage
mpg_new <- mutate(mpg_new, avg_mpg = (cty + hwy) / 2)
# 3. keep only rows averaging above 25 mpg
mpg_new <- filter(mpg_new, avg_mpg > 25)
# 4. drop rows with an NA in any remaining column
mpg_new <- drop_na(mpg_new)
# 5. rename manufacturer to brand
mpg_new <- rename(mpg_new, brand = manufacturer)
# Inspect the first six rows of the result
head(mpg_new, n = 6)
## # A tibble: 6 × 5
## brand model cty hwy avg_mpg
## <chr> <chr> <int> <int> <dbl>
## 1 audi a4 20 31 25.5
## 2 audi a4 21 30 25.5
## 3 chevrolet malibu 22 30 26
## 4 honda civic 28 33 30.5
## 5 honda civic 24 32 28
## 6 honda civic 25 32 28.5