Data Summary

Setting up working directory

getwd()

## [1] "/Users/markyuhasz/FallWinter 2024 Classes/MKTG3P98-Business analytics and intell/Assignment1"

setwd("/Users/markyuhasz/FallWinter 2024 Classes/MKTG3P98-Business analytics and intell/Assignment1")

Install/Select Packages.

library(readxl)
library(ggplot2)
library(readr) #To read CSV files
library(stringr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Read the CSV files and get accurate structure of data frame stats.

Car1<-read.csv("Car_Survey_1a.csv")
str(Car1)

## 'data.frame':    1180 obs. of  23 variables:
##  $ Resp        : chr  "Res1" "Res2" "Res3" "Res4" ...
##  $ Att_1       : int  6 7 7 4 6 6 1 6 3 6 ...
##  $ Att_2       : int  6 5 7 1 6 6 1 5 2 6 ...
##  $ Enj_1       : int  6 5 7 1 6 6 1 5 3 4 ...
##  $ Enj_2       : int  6 2 5 1 5 5 1 3 2 4 ...
##  $ Perform_1   : int  5 2 5 1 5 5 2 5 2 4 ...
##  $ Perform_2   : int  6 6 5 1 2 5 2 5 3 4 ...
##  $ Perform_3   : int  3 7 3 1 1 7 2 2 1 1 ...
##  $ WOM_1       : int  3 5 6 7 7 5 2 4 6 5 ...
##  $ WOM_2       : int  3 5 6 7 7 5 3 6 6 6 ...
##  $ Futu_Pur_1  : int  3 6 7 3 7 7 5 4 7 6 ...
##  $ Futu_Pur_2  : int  3 6 7 3 6 7 2 4 7 6 ...
##  $ Valu_Percp_1: int  5 6 5 6 6 7 2 4 6 6 ...
##  $ Valu_Percp_2: int  2 7 7 5 5 7 2 4 6 6 ...
##  $ Pur_Proces_1: int  6 7 7 5 6 7 2 4 6 6 ...
##  $ Pur_Proces_2: int  4 6 7 4 7 7 6 4 6 6 ...
##  $ Residence   : int  2 2 1 2 1 2 2 1 2 1 ...
##  $ Pay_Meth    : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Insur_Type  : chr  "Collision" "Collision" "Collision" "Collision" ...
##  $ Gender      : chr  "Male" "Male" "Male" "Male" ...
##  $ Age         : int  18 18 19 19 19 19 19 21 21 21 ...
##  $ Education   : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ X           : logi  NA NA NA NA NA NA ...

Car2<-read.csv("Car_Survey_2a.csv")
str(Car2)

## 'data.frame':    1049 obs. of  9 variables:
##  $ Respondents: chr  "Res1" "Res2" "Res3" "Res4" ...
##  $ Region     : chr  "European" "European" "European" "European" ...
##  $ Model      : chr  "Ford Expedition" "Ford Expedition" "Ford Expedition" "Ford Expedition" ...
##  $ MPG        : int  15 15 15 15 15 15 15 15 15 15 ...
##  $ Cyl        : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ acc1       : num  5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 ...
##  $ C_cost.    : num  16 16 16 16 16 16 16 16 16 16 ...
##  $ H_Cost     : num  14 14 14 14 14 14 14 14 14 14 ...
##  $ Post.Satis : int  4 3 5 5 5 3 3 6 3 5 ...

Rename Unique ID in Car2 to match Car1

names(Car2)[1]<-c("Resp")
head(Car2, n = 1)

##   Resp   Region           Model MPG Cyl acc1 C_cost. H_Cost Post.Satis
## 1 Res1 European Ford Expedition  15   8  5.5      16     14          4

Merge Car1 and Car2 into one data set

Car_Total <- merge(Car1, Car2, by = "Resp")

Finding numeric columns to remove NA values

numeric_cols <- sapply(Car_Total, is.numeric)

Replacing NA Values with calulated means for all numeric columns

Car_Total[, numeric_cols] <- lapply(Car_Total[, numeric_cols], function(x) {
  mean_val <- mean (x, na.rm = TRUE)
  x[is.na(x)] <- mean_val
  return(x)
})

Verify Replacement

summary(Car_Total)

##      Resp               Att_1           Att_2           Enj_1      
##  Length:1049        Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  Class :character   1st Qu.:4.000   1st Qu.:4.000   1st Qu.:5.000  
##  Mode  :character   Median :5.000   Median :6.000   Median :6.000  
##                     Mean   :4.882   Mean   :5.287   Mean   :5.378  
##                     3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:7.000  
##                     Max.   :7.000   Max.   :7.000   Max.   :7.000  
##      Enj_2         Perform_1       Perform_2       Perform_3    
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:4.000   1st Qu.:4.000   1st Qu.:3.000  
##  Median :5.000   Median :5.000   Median :5.000   Median :5.000  
##  Mean   :4.575   Mean   :4.947   Mean   :4.831   Mean   :4.217  
##  3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
##      WOM_1           WOM_2        Futu_Pur_1      Futu_Pur_2     Valu_Percp_1  
##  Min.   :1.000   Min.   :1.00   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:4.00   1st Qu.:5.000   1st Qu.:5.000   1st Qu.:5.000  
##  Median :6.000   Median :6.00   Median :6.000   Median :6.000   Median :6.000  
##  Mean   :5.286   Mean   :5.35   Mean   :5.321   Mean   :5.371   Mean   :5.411  
##  3rd Qu.:7.000   3rd Qu.:6.00   3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.00   Max.   :9.000   Max.   :7.000   Max.   :7.000  
##   Valu_Percp_2    Pur_Proces_1    Pur_Proces_2     Residence    
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:5.000   1st Qu.:4.000   1st Qu.:1.000  
##  Median :5.000   Median :6.000   Median :5.000   Median :1.000  
##  Mean   :5.114   Mean   :5.256   Mean   :4.923   Mean   :1.474  
##  3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:2.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :5.000  
##     Pay_Meth      Insur_Type           Gender               Age       
##  Min.   :1.000   Length:1049        Length:1049        Min.   :18.00  
##  1st Qu.:1.000   Class :character   Class :character   1st Qu.:23.00  
##  Median :2.000   Mode  :character   Mode  :character   Median :34.00  
##  Mean   :2.153                                         Mean   :35.22  
##  3rd Qu.:3.000                                         3rd Qu.:48.00  
##  Max.   :3.000                                         Max.   :60.00  
##    Education        X              Region             Model          
##  Min.   :1.000   Mode:logical   Length:1049        Length:1049       
##  1st Qu.:2.000   NA's:1049      Class :character   Class :character  
##  Median :2.000                  Mode  :character   Mode  :character  
##  Mean   :1.989                                                       
##  3rd Qu.:2.000                                                       
##  Max.   :3.000                                                       
##       MPG             Cyl           acc1          C_cost.          H_Cost      
##  Min.   :14.00   Min.   :4.0   Min.   :3.600   Min.   : 7.00   Min.   : 6.000  
##  1st Qu.:17.00   1st Qu.:4.0   1st Qu.:5.100   1st Qu.:10.00   1st Qu.: 8.000  
##  Median :19.00   Median :6.0   Median :6.500   Median :12.00   Median :10.000  
##  Mean   :19.58   Mean   :5.8   Mean   :6.202   Mean   :11.35   Mean   : 9.634  
##  3rd Qu.:22.00   3rd Qu.:6.0   3rd Qu.:7.500   3rd Qu.:13.00   3rd Qu.:11.000  
##  Max.   :26.00   Max.   :8.0   Max.   :8.500   Max.   :16.00   Max.   :14.000  
##    Post.Satis  
##  Min.   :2.00  
##  1st Qu.:5.00  
##  Median :6.00  
##  Mean   :5.28  
##  3rd Qu.:6.00  
##  Max.   :7.00

Creating a chart to depict number of people based on car models

ggplot(Car_Total, aes(x = Model, fill = Model)) +
  geom_bar() +
  theme_bw() +
  labs(title = " Number of People Owning Different Car Models",
       x = "Car Model",
       y = "Number") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Separate Model of Car from Make of Car.

Car_Total[c('Make', 'Model_v1')] <- str_split_fixed(Car_Total$Model, " ", 2) #slips it by second character

Mutate Data set to indicate Parent Companies of Car Brands

Car_Total <- Car_Total %>% #call dataframe and create new column with new groupings
  mutate(Parent = case_when(Make == "Buick" ~ "General Motors",
                            Make == "Chevrolet" ~ "General Motors",
                            Make == "Chrysler" ~ "Chrysler",
                            Make == "Dodge" ~ "Chrysler",
                            Make == "Fiat" ~ "Chrysler",
                            Make == "Ford" ~ "Ford",
                            Make == "Honda" ~ "Honda",
                            Make == "Kia" ~ "Kia",
                            Make == "Lincoln" ~ "Ford",
                            Make == "Toyota" ~ "Toyota",
                            TRUE ~ "Check"))

Car_Total$Parent<-as.factor(Car_Total$Parent)

Graphing Post Purchase Satisfaction based on Parent Company

p <-ggplot(Car_Total, aes(x=Post.Satis, fill = Parent)) +
  theme_bw() +
  geom_bar(position = 'dodge2') +
       labs(y='Number of Cars',
            x = 'Post Satisfaction',
            title = "Post Purchase Satisfaction by Make")
p + annotate("text", x=3, y=70, label = "1 = Dissatified, 7 = Very Satified")

Make a new column for Future Purchase Intentions

Car_Total$Futmean = (Car_Total$Futu_Pur_1 + Car_Total$Futu_Pur_2) / 2
View(Car_Total[c("Futu_Pur_1", "Futu_Pur_2", "Futmean")])

Filter Based on Toyota, Ford, Honda and Chrysler

Car_Total_Multi <- Car_Total %>%
  filter(Parent %in% c("Ford", "Toyota", "Honda", "Chrysler"))

Visualize Data Based on Parent Company

f <-ggplot(Car_Total_Multi, aes(x = factor(round(Futmean)), fill = Parent)) +
  geom_bar(position = 'dodge') +
  labs(y = 'Owners Rating',
       x = "Future Purchase Intention",
       title = "Number of People that will/likely Purchase the same Car") +
  theme_bw()
f + annotate("text", x=3, y=80, label = "1 = Strongly Disagree, 7 = Strongly Agree")

### Value Perception Contingency Table

Car_Total$Valu_mean <- (Car_Total$Valu_Percp_1 + Car_Total$Valu_Percp_2) / 2
brand_region_table <- aggregate(Valu_mean~Parent + Region, Car_Total, mean)
print(brand_region_table)

##            Parent         Region Valu_mean
## 1        Chrysler       American  5.143597
## 2            Ford       American  5.061728
## 3  General Motors       American  5.333333
## 4           Honda       American  5.026316
## 5             Kia       American  5.666667
## 6          Toyota       American  5.119353
## 7        Chrysler          Asian  4.791667
## 8            Ford          Asian  5.062500
## 9           Honda          Asian  5.422414
## 10         Toyota          Asian  5.127451
## 11       Chrysler       European  5.340909
## 12           Ford       European  5.637931
## 13 General Motors       European  4.854839
## 14          Honda       European  5.413793
## 15            Kia       European  5.421053
## 16         Toyota       European  5.281250
## 17       Chrysler Middle Eastern  5.489691
## 18           Ford Middle Eastern  5.485981
## 19 General Motors Middle Eastern  5.120000
## 20          Honda Middle Eastern  5.455882
## 21         Toyota Middle Eastern  5.750000

Filtering to only show top four companies

Multiple_Valu_mean <- brand_region_table %>%
  filter(Parent == "Honda" | Parent == "Toyota" | Parent == "Ford" | Parent == "Chrysler")
head(Multiple_Valu_mean, n=10)

##      Parent   Region Valu_mean
## 1  Chrysler American  5.143597
## 2      Ford American  5.061728
## 3     Honda American  5.026316
## 4    Toyota American  5.119353
## 5  Chrysler    Asian  4.791667
## 6      Ford    Asian  5.062500
## 7     Honda    Asian  5.422414
## 8    Toyota    Asian  5.127451
## 9  Chrysler European  5.340909
## 10     Ford European  5.637931

table(Multiple_Valu_mean$Parent)

## 
##       Chrysler           Ford General Motors          Honda            Kia 
##              4              4              0              4              0 
##         Toyota 
##              4

Displaying Data Visualization

ggplot(Multiple_Valu_mean, aes(x=Region, y=Valu_mean, group=Parent)) +
  geom_line(aes(color=Parent)) +
  geom_point(aes(color=Parent)) +
  scale_y_continuous(limits = c(4.5,6))

labs(y="Value Mean", 
     title = "Value Mean for Toyota, Honda, Ford and Chrysler by Region")

## $y
## [1] "Value Mean"
## 
## $title
## [1] "Value Mean for Toyota, Honda, Ford and Chrysler by Region"
## 
## attr(,"class")
## [1] "labels"

Create three categorical values for age

Car_Total$AgeGrp<-cut(Car_Total$Age,
                      breaks = c(0, 30, 50, Inf),
                      labels = c("Young adults", "Adults", "Mature Adults"),
                      right = FALSE)
names(Car_Total)

##  [1] "Resp"         "Att_1"        "Att_2"        "Enj_1"        "Enj_2"       
##  [6] "Perform_1"    "Perform_2"    "Perform_3"    "WOM_1"        "WOM_2"       
## [11] "Futu_Pur_1"   "Futu_Pur_2"   "Valu_Percp_1" "Valu_Percp_2" "Pur_Proces_1"
## [16] "Pur_Proces_2" "Residence"    "Pay_Meth"     "Insur_Type"   "Gender"      
## [21] "Age"          "Education"    "X"            "Region"       "Model"       
## [26] "MPG"          "Cyl"          "acc1"         "C_cost."      "H_Cost"      
## [31] "Post.Satis"   "Make"         "Model_v1"     "Parent"       "Futmean"     
## [36] "Valu_mean"    "AgeGrp"

head(Car_Total$AgeGrp)

## [1] Young adults Young adults Adults       Young adults Young adults
## [6] Young adults
## Levels: Young adults Adults Mature Adults

Visualization of age groups and car make purchases

ggplot(Car_Total, aes(x=Parent, fill = AgeGrp)) +
  theme_bw() +
  geom_bar(position = 'dodge') +
  labs(y="Number of People",
       x="Age",
       title = "Age Groups Purchasing Make")

Assignment 1

Marc Yuhasz

2024-10-08

Introduction

This Report is for Individual Assignment 1

Data Summary

Setting up working directory

Install/Select Packages.

Read the CSV files and get accurate structure of data frame stats.

Rename Unique ID in Car2 to match Car1

Merge Car1 and Car2 into one data set

Finding numeric columns to remove NA values

Replacing NA Values with calulated means for all numeric columns

Verify Replacement

Creating a chart to depict number of people based on car models

Separate Model of Car from Make of Car.

Mutate Data set to indicate Parent Companies of Car Brands

Graphing Post Purchase Satisfaction based on Parent Company

Make a new column for Future Purchase Intentions

Filter Based on Toyota, Ford, Honda and Chrysler

Visualize Data Based on Parent Company

Filtering to only show top four companies

Displaying Data Visualization

Create three categorical values for age

Visualization of age groups and car make purchases