R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(dplyr)
library(ggplot2)
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
# Locations of the two survey workbooks
file1 <- "/Users/ethanbalogh/Downloads/Car_Survey_1.xlsx"
file2 <- "/Users/ethanbalogh/Downloads/Car_Survey_2.xlsx"

# Load the first workbook as-is
df1 <- read_excel(file1)

# Load the second workbook and rename its `Respondents` key column to `Resp`
# so that it lines up with the key used in df1
df2 <- rename(read_excel(file2), Resp = Respondents)

# Left join: keep every row of df1 and attach df2's columns where `Resp` matches
df <- left_join(df1, df2, by = "Resp")

# Preview the first few rows of the combined data
head(df)
## # A tibble: 6 × 30
##   Resp  Att_1 Att_2 Enj_1 Enj_2 Perform_1 Perform_2 Perform_3 WOM_1 WOM_2
##   <chr> <dbl> <dbl> <dbl> <dbl>     <dbl>     <dbl>     <dbl> <dbl> <dbl>
## 1 Res1      6     6     6     6         5         6         3     3     3
## 2 Res2      7     5     5     2         2         6         7     5     5
## 3 Res3      7     7     7     5         5         5         3     6     6
## 4 Res4      4     1     1     1         1         1         1     7     7
## 5 Res5      6     6     6     5         5         2         1     7     7
## 6 Res6      6     6     6     5         5         5         7     5     5
## # ℹ 20 more variables: Futu_Pur_1 <dbl>, Futu_Pur_2 <dbl>, Valu_Percp_1 <dbl>,
## #   Valu_Percp_2 <dbl>, Pur_Proces_1 <dbl>, Pur_Proces_2 <dbl>,
## #   Residence <dbl>, Pay_Meth <dbl>, Insur_Type <chr>, Gender <chr>, Age <dbl>,
## #   Education <dbl>, Region <chr>, Model <chr>, MPG <dbl>, Cyl <dbl>,
## #   acc1 <dbl>, `C_cost ` <dbl>, H_Cost <dbl>, `Post-Satis` <dbl>
# Normalise the column names (e.g. `C_cost ` becomes c_cost and `Post-Satis`
# becomes post_satis), then print them to confirm the result
df <- df %>% clean_names()
names(df)
##  [1] "resp"         "att_1"        "att_2"        "enj_1"        "enj_2"       
##  [6] "perform_1"    "perform_2"    "perform_3"    "wom_1"        "wom_2"       
## [11] "futu_pur_1"   "futu_pur_2"   "valu_percp_1" "valu_percp_2" "pur_proces_1"
## [16] "pur_proces_2" "residence"    "pay_meth"     "insur_type"   "gender"      
## [21] "age"          "education"    "region"       "model"        "mpg"         
## [26] "cyl"          "acc1"         "c_cost"       "h_cost"       "post_satis"
# Count the missing (NA) entries in each column
df %>%
  is.na() %>%
  colSums()
##         resp        att_1        att_2        enj_1        enj_2    perform_1 
##            0            4            0            4            4            2 
##    perform_2    perform_3        wom_1        wom_2   futu_pur_1   futu_pur_2 
##            4            1            1            3            5            2 
## valu_percp_1 valu_percp_2 pur_proces_1 pur_proces_2    residence     pay_meth 
##            4            1            3            4            5            0 
##   insur_type       gender          age    education       region        model 
##            7            3            0            0            0            0 
##          mpg          cyl         acc1       c_cost       h_cost   post_satis 
##            0            0            0            0            0            0
# Keep only complete rows: drop_na() with no columns named removes every row
# that has an NA in ANY column. Then collapse exact duplicate rows.
df <- df %>%
  drop_na() %>%
  distinct()


# Bucket respondents into labelled age bands.
# cut() builds right-closed intervals and, by default, leaves the lowest break
# open, so a respondent aged exactly 18 would be assigned NA.
# include.lowest = TRUE closes the first interval, giving the bands
# [18, 30], (30, 50], (50, 70], (70, 100].
# Ages outside 18-100 still become NA.
df <- df %>%
  mutate(
    age_group = cut(
      age,
      breaks = c(18, 30, 50, 70, 100),
      labels = c("Young", "Middle-aged", "Older Adult", "Senior"),
      include.lowest = TRUE
    )
  )

# Derive the brand as the first space-separated word of the model name
df <- mutate(df, brand = word(model, start = 1))

# Bar chart: number of rows (respondents) per brand
ggplot(data = df, mapping = aes(x = brand)) +
  geom_bar(fill = "steelblue") +
  labs(
    title = "Car Purchases by Brand",
    x = "Brand",
    y = "Count"
  ) +
  theme_minimal()

# Dodged bar chart: row counts per age group, with one bar per model
# placed side by side within each group
ggplot(data = df, mapping = aes(x = age_group, fill = model)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Car Purchases by Age Group",
    x = "Age Group",
    y = "Count"
  ) +
  theme_minimal()

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.