# Q1 - Load the car price data set from the CSV file in the Downloads folder.
car_price <- read.csv(file = "/Users/swapnilkumar/Downloads/car-price.csv")
# Preview the data; head() shows the first six rows by default.
head(car_price, n = 6L)
## Car.Category Car.Model MSRP.in.CA City.MPG Highway.MPG Weight..lbs.
## 1 Compact Sedan Honda Civic $23,400 31 40 2,800
## 2 Compact Sedan Toyota Corolla $21,300 30 38 2,800
## 3 Compact Sedan Mazda3 $24,900 27 36 2,900
## 4 Compact Sedan Hyundai Elantra $20,900 30 38 2,800
## 5 Compact Sedan Nissan Sentra $20,300 29 37 2,800
## 6 Compact Sedan Kia Forte $20,500 30 37 2,800
## Country.of.Origin
## 1 Japan
## 2 Japan
## 3 Japan
## 4 South Korea
## 5 Japan
## 6 South Korea
# Q2 - Bar plot of the frequency of each country of origin.
# Start by tabulating how many cars come from each country.
table(car_price[["Country.of.Origin"]])
##
## Germany Italy Japan South Korea Sweden
## 7 1 25 9 1
## United States
## 17
# Bar plots of the number of cars per country of origin.
# The frequency table is computed once and reused by every plot below.
origin_counts <- table(car_price$Country.of.Origin)
# One colour per country, derived from the data instead of a hard-coded count.
origin_colours <- rainbow(length(origin_counts))

# Plain bar plot of the frequencies.
barplot(
  origin_counts,
  main = "Country of Origin",
  xlab = "Country",
  ylab = "Frequency"
)

# Same plot with one colour per bar.
barplot(
  origin_counts,
  main = "Country of Origin",
  xlab = "Country",
  ylab = "Frequency",
  col = origin_colours
)

# Same plot with a legend. The argument is spelled out as `legend.text`
# rather than relying on partial matching of `legend`.
barplot(
  origin_counts,
  main = "Country of Origin",
  xlab = "Country",
  ylab = "Frequency",
  col = origin_colours,
  legend.text = names(origin_counts)
)

# Order the countries from most to least frequent and plot again.
sorted_countries <- sort(origin_counts, decreasing = TRUE)
barplot(
  sorted_countries,
  main = "Country of Origin",
  xlab = "Country",
  ylab = "Frequency",
  col = rainbow(length(sorted_countries)),
  legend.text = names(sorted_countries)
)
# Data clean-up so MSRP can be treated as a number.
# Strip the dollar sign and thousands separators, then convert to numeric.
msrp_digits <- gsub("[\\$,]", "", car_price$MSRP.in.CA)
car_price$MSRP.in.CA <- as.numeric(msrp_digits)
# Keep only the rows where both City MPG and MSRP are finite values.
rows_to_keep <- is.finite(car_price$City.MPG) & is.finite(car_price$MSRP.in.CA)
car_price <- car_price[rows_to_keep, ]
#Question no. 3 - scatter plot using ggplot2
library(ggplot2)

# Scatter plot of MSRP (y axis) against City MPG (x axis).
ggplot(data = car_price, mapping = aes(x = City.MPG, y = MSRP.in.CA)) +
  geom_point() +
  ggtitle("MSRP vs City MPG")

# Same plot, with car category mapped to colour and country of origin to shape.
ggplot(
  data = car_price,
  mapping = aes(
    x = City.MPG,
    y = MSRP.in.CA,
    color = Car.Category,
    shape = Country.of.Origin
  )
) +
  geom_point() +
  ggtitle("MSRP vs City MPG")

# Scatter plot of MSRP (y axis) against Highway MPG (x axis).
ggplot(data = car_price, mapping = aes(x = Highway.MPG, y = MSRP.in.CA)) +
  geom_point() +
  ggtitle("MSRP vs Highway MPG")

# Same plot, with car category mapped to colour and country of origin to shape.
ggplot(
  data = car_price,
  mapping = aes(
    x = Highway.MPG,
    y = MSRP.in.CA,
    color = Car.Category,
    shape = Country.of.Origin
  )
) +
  geom_point() +
  ggtitle("MSRP vs Highway MPG")

# Question 4 - box plots using ggplot2 and boxplot()
# Side-by-side box plots of MSRP, one box per car category (ggplot2 version).
ggplot(car_price, aes(x = Car.Category, y = MSRP.in.CA)) +
  geom_boxplot() +
  ggtitle("MSRP by Car Category")

# Base-graphics version, one box per country of origin.
# The column is referenced by its full name (MSRP.in.CA) through the
# formula/data interface; `car_price$MSRP` only worked through partial
# matching of `$`, which breaks silently if another MSRP* column is added.
boxplot(
  MSRP.in.CA ~ Country.of.Origin,
  data = car_price,
  main = "Box plot of MSRP across different categories",
  xlab = "Country of Origin",
  ylab = "MSRP"
)

# add interpretations
#The box plots show the distribution of MSRP within each group: the ggplot2 version groups by car category, while the boxplot() version groups by country of origin. Each box spans the interquartile range (IQR), and the horizontal line inside the box marks the median. The whiskers extend to the most extreme values that lie within 1.5 times the IQR of the lower and upper quartiles; observations beyond the whiskers are drawn as individual points (outliers).
# Side-by-side box plots of City MPG, one box per car category (ggplot2).
ggplot(data = car_price, mapping = aes(x = Car.Category, y = City.MPG)) +
  geom_boxplot() +
  ggtitle("City MPG by Car Category")

# Base-graphics boxplot() version, one box per country of origin.
boxplot(
  City.MPG ~ Country.of.Origin,
  data = car_price,
  main = "Box plot of City MPG across different categories",
  xlab = "Country of Origin",
  ylab = "City MPG"
)

#add interpretations
#The box plots show the distribution of City MPG within each group: the ggplot2 version groups by car category, while the boxplot() version groups by country of origin. Each box spans the interquartile range (IQR), and the horizontal line inside the box marks the median. The whiskers extend to the most extreme values that lie within 1.5 times the IQR of the lower and upper quartiles; observations beyond the whiskers are drawn as individual points (outliers).
# Question 5 - Pivot table of the average MSRP and average City MPG.
# Rows are grouped by country of origin; the mean of each measure is taken
# per group and the columns are given descriptive names in one step.
pivot_table <- setNames(
  aggregate(
    x = car_price[c("MSRP.in.CA", "City.MPG")],
    by = list(car_price[["Country.of.Origin"]]),
    FUN = mean
  ),
  c("Country.of.Origin", "Average.MSRP", "Average.City.MPG")
)
pivot_table
## Country.of.Origin Average.MSRP Average.City.MPG
## 1 Germany 34328.57 25.00000
## 2 Italy 43900.00 24.00000
## 3 Japan 30324.00 25.44000
## 4 South Korea 28766.67 25.88889
## 5 Sweden 41900.00 25.00000
## 6 United States 30529.41 23.29412
# Visualise the pattern in the pivot table: average MSRP per country of origin.
# - names.arg labels each bar on the x axis (without it the bars are unlabeled).
# - legend.text is spelled out instead of relying on partial matching of
#   `legend`.
# - The palette size follows the number of rows rather than a hard-coded 6.
barplot(
  pivot_table$Average.MSRP,
  names.arg = pivot_table$Country.of.Origin,
  cex.names = 0.8,  # shrink labels so longer country names fit under the bars
  main = "Average MSRP across different categories",
  xlab = "Country of Origin",
  ylab = "Average MSRP",
  col = rainbow(nrow(pivot_table)),
  legend.text = pivot_table$Country.of.Origin
)
