This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(dplyr)
library(ggplot2)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# Locations of the two survey workbooks
file1 <- "/Users/ethanbalogh/Downloads/Car_Survey_1.xlsx"
file2 <- "/Users/ethanbalogh/Downloads/Car_Survey_2.xlsx"
# Load each workbook into its own tibble
df1 <- read_excel(path = file1)
df2 <- read_excel(path = file2)
# Give the key column of df2 the name it has in df1 ("Resp"), then attach
# df2's columns to every row of df1 by matching on that key
df2 <- rename(df2, Resp = Respondents)
df <- left_join(df1, df2, by = "Resp")
# Preview the first rows of the combined data
head(df)
## # A tibble: 6 × 30
## Resp Att_1 Att_2 Enj_1 Enj_2 Perform_1 Perform_2 Perform_3 WOM_1 WOM_2
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Res1 6 6 6 6 5 6 3 3 3
## 2 Res2 7 5 5 2 2 6 7 5 5
## 3 Res3 7 7 7 5 5 5 3 6 6
## 4 Res4 4 1 1 1 1 1 1 7 7
## 5 Res5 6 6 6 5 5 2 1 7 7
## 6 Res6 6 6 6 5 5 5 7 5 5
## # ℹ 20 more variables: Futu_Pur_1 <dbl>, Futu_Pur_2 <dbl>, Valu_Percp_1 <dbl>,
## # Valu_Percp_2 <dbl>, Pur_Proces_1 <dbl>, Pur_Proces_2 <dbl>,
## # Residence <dbl>, Pay_Meth <dbl>, Insur_Type <chr>, Gender <chr>, Age <dbl>,
## # Education <dbl>, Region <chr>, Model <chr>, MPG <dbl>, Cyl <dbl>,
## # acc1 <dbl>, `C_cost ` <dbl>, H_Cost <dbl>, `Post-Satis` <dbl>
# Normalise the column names to lower-case snake_case
# (e.g. `Post-Satis` becomes post_satis), then list them to confirm
df <- df %>% clean_names()
colnames(df)
## [1] "resp" "att_1" "att_2" "enj_1" "enj_2"
## [6] "perform_1" "perform_2" "perform_3" "wom_1" "wom_2"
## [11] "futu_pur_1" "futu_pur_2" "valu_percp_1" "valu_percp_2" "pur_proces_1"
## [16] "pur_proces_2" "residence" "pay_meth" "insur_type" "gender"
## [21] "age" "education" "region" "model" "mpg"
## [26] "cyl" "acc1" "c_cost" "h_cost" "post_satis"
# Count the missing values in each column
df %>%
  is.na() %>%
  colSums()
## resp att_1 att_2 enj_1 enj_2 perform_1
## 0 4 0 4 4 2
## perform_2 perform_3 wom_1 wom_2 futu_pur_1 futu_pur_2
## 4 1 1 3 5 2
## valu_percp_1 valu_percp_2 pur_proces_1 pur_proces_2 residence pay_meth
## 4 1 3 4 5 0
## insur_type gender age education region model
## 7 3 0 0 0 0
## mpg cyl acc1 c_cost h_cost post_satis
## 0 0 0 0 0 0
# Keep only fully observed rows (drop_na() with no columns named discards
# any row containing at least one NA), then collapse exact duplicate rows
df <- df %>%
  drop_na() %>%
  distinct()
# Bin respondent age into four labelled groups.
# Intervals are closed on the right: [18, 30], (30, 50], (50, 70], (70, 100].
# include.lowest = TRUE keeps respondents aged exactly 18 in "Young"; without
# it cut() treats the first interval as (18, 30] and assigns them NA.
# Ages outside 18-100 still become NA.
df <- df %>%
  mutate(
    age_group = cut(
      age,
      breaks = c(18, 30, 50, 70, 100),
      labels = c("Young", "Middle-aged", "Older Adult", "Senior"),
      include.lowest = TRUE
    )
  )
# Derive the brand by taking the first space-separated word of the model name
df <- mutate(df, brand = word(model, start = 1))
# Bar chart: number of rows (purchases) per brand
ggplot(data = df, mapping = aes(x = brand)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Car Purchases by Brand", x = "Brand", y = "Count") +
  theme_minimal()
# Grouped bar chart: counts per age group, with one bar per model placed
# side by side within each group
ggplot(data = df, mapping = aes(x = age_group, fill = model)) +
  geom_bar(position = "dodge") +
  labs(title = "Car Purchases by Age Group", x = "Age Group", y = "Count") +
  theme_minimal()
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.