# Q1 - read the data file car-price.csv from the Downloads folder.
# The path is built relative to "~" and expanded with path.expand(), so the
# script is not tied to one particular user account's absolute home path.
car_price <- read.csv(path.expand(file.path("~", "Downloads", "car-price.csv")))
# Preview the data: head() prints the first 6 rows by default.
head(car_price)
##    Car.Category       Car.Model MSRP.in.CA City.MPG Highway.MPG Weight..lbs.
## 1 Compact Sedan     Honda Civic    $23,400       31          40        2,800
## 2 Compact Sedan  Toyota Corolla    $21,300       30          38        2,800
## 3 Compact Sedan          Mazda3    $24,900       27          36        2,900
## 4 Compact Sedan Hyundai Elantra    $20,900       30          38        2,800
## 5 Compact Sedan   Nissan Sentra    $20,300       29          37        2,800
## 6 Compact Sedan       Kia Forte    $20,500       30          37        2,800
##   Country.of.Origin
## 1             Japan
## 2             Japan
## 3             Japan
## 4       South Korea
## 5             Japan
## 6       South Korea
# Q2 - bar plots of the frequency of each country of origin
# Tabulate the number of cars per country once and reuse the counts below.
country_counts <- table(car_price$Country.of.Origin)
country_counts
## 
##       Germany         Italy         Japan   South Korea        Sweden 
##             7             1            25             9             1 
## United States 
##            17
# One colour per country, sized from the data rather than a hard-coded count,
# so the palette stays correct if countries are added or removed.
country_colors <- rainbow(length(country_counts))

# Plain bar plot of the frequency of each country of origin
barplot(country_counts, main = "Country of Origin", xlab = "Country",
        ylab = "Frequency")

# Same plot with one colour per bar
barplot(country_counts, main = "Country of Origin", xlab = "Country",
        ylab = "Frequency", col = country_colors)

# Same plot with a legend. The argument name legend.text is spelled out in
# full instead of relying on partial matching of `legend`.
barplot(country_counts, main = "Country of Origin", xlab = "Country",
        ylab = "Frequency", col = country_colors,
        legend.text = names(country_counts))

# Sort the countries by frequency in descending order and plot again
sorted_countries <- sort(country_counts, decreasing = TRUE)
barplot(sorted_countries, main = "Country of Origin", xlab = "Country",
        ylab = "Frequency", col = country_colors,
        legend.text = names(sorted_countries))

# Data clean-up so MSRP can be treated as a number:
# strip the "$" and "," characters from the MSRP text, then convert to numeric.
msrp_text <- gsub("[\\$,]", "", car_price[["MSRP.in.CA"]])
car_price[["MSRP.in.CA"]] <- as.numeric(msrp_text)
# Keep only the rows where both City MPG and MSRP are finite (drops NA/NaN/Inf).
rows_to_keep <- is.finite(car_price[["City.MPG"]]) &
  is.finite(car_price[["MSRP.in.CA"]])
car_price <- car_price[rows_to_keep, ]

#Question no. 3 - scatter plot using ggplot2
library(ggplot2)

# Scatter plot of MSRP (y axis) against City MPG (x axis)
ggplot(data = car_price, mapping = aes(x = City.MPG, y = MSRP.in.CA)) +
  geom_point() +
  ggtitle("MSRP vs City MPG")

# Same plot with the two categorical variables added:
# Car.Category is mapped to colour and Country.of.Origin to point shape
ggplot(
  data = car_price,
  mapping = aes(
    x = City.MPG,
    y = MSRP.in.CA,
    color = Car.Category,
    shape = Country.of.Origin
  )
) +
  geom_point() +
  ggtitle("MSRP vs City MPG")

# Scatter plot of MSRP (y axis) against Highway MPG (x axis)
ggplot(data = car_price, mapping = aes(x = Highway.MPG, y = MSRP.in.CA)) +
  geom_point() +
  ggtitle("MSRP vs Highway MPG")

# Same plot with the two categorical variables added:
# Car.Category is mapped to colour and Country.of.Origin to point shape
ggplot(
  data = car_price,
  mapping = aes(
    x = Highway.MPG,
    y = MSRP.in.CA,
    color = Car.Category,
    shape = Country.of.Origin
  )
) +
  geom_point() +
  ggtitle("MSRP vs Highway MPG")

# Question 4 - box plot using ggplot2 and boxplot()
# Side-by-side box plots of MSRP for each car category (ggplot2)
ggplot(car_price, aes(x = Car.Category, y = MSRP.in.CA)) +
  geom_boxplot() +
  ggtitle("MSRP by Car Category")

# Base-graphics boxplot() of MSRP grouped by country of origin.
# The price column is referenced by its full name, MSRP.in.CA: the shorthand
# car_price$MSRP resolves only through `$` partial matching and would silently
# return NULL if another column starting with "MSRP" were ever added.
# NOTE(review): the ggplot above groups by Car.Category while this plot groups
# by Country.of.Origin - confirm which grouping is intended.
boxplot(MSRP.in.CA ~ Country.of.Origin, data = car_price,
        main = "Box plot of MSRP across different categories",
        xlab = "Country of Origin", ylab = "MSRP")

# Interpretation
# The box plot shows the distribution of MSRP values across different car
# categories. The box represents the interquartile range (IQR) of the data,
# with the median value indicated by the horizontal line inside the box. The
# whiskers extend to the minimum and maximum values within 1.5 times the IQR
# from the lower and upper quartiles, respectively. Outliers are represented
# as individual points outside the whiskers.

# Side-by-side box plots of City MPG for each car category (ggplot2)
ggplot(data = car_price, mapping = aes(x = Car.Category, y = City.MPG)) +
  geom_boxplot() +
  ggtitle("City MPG by Car Category")

# Base-graphics boxplot() of City MPG grouped by country of origin,
# written with the formula interface and an explicit data argument
boxplot(
  City.MPG ~ Country.of.Origin,
  data = car_price,
  main = "Box plot of City MPG across different categories",
  xlab = "Country of Origin",
  ylab = "City MPG"
)

# Interpretation
# The box plot shows the distribution of City MPG values across different car
# categories. The box represents the interquartile range (IQR) of the data,
# with the median value indicated by the horizontal line inside the box. The
# whiskers extend to the minimum and maximum values within 1.5 times the IQR
# from the lower and upper quartiles, respectively. Outliers are represented
# as individual points outside the whiskers.

# Question 5 - pivot table reporting the average MSRP and average City MPG
# across different categories
# Average the two numeric columns within each country of origin.
pivot_table <- aggregate(
  car_price[, c("MSRP.in.CA", "City.MPG")],
  by = list(car_price$Country.of.Origin),
  FUN = mean
)
# Give the grouping column and the two averages descriptive names.
colnames(pivot_table) <- c("Country.of.Origin", "Average.MSRP", "Average.City.MPG")
pivot_table
##   Country.of.Origin Average.MSRP Average.City.MPG
## 1           Germany     34328.57         25.00000
## 2             Italy     43900.00         24.00000
## 3             Japan     30324.00         25.44000
## 4       South Korea     28766.67         25.88889
## 5            Sweden     41900.00         25.00000
## 6     United States     30529.41         23.29412
# Visualize the pattern: one bar per country showing its average MSRP.
# The palette is sized from the number of rows in the pivot table rather than
# a hard-coded count, and legend.text is spelled out in full instead of
# relying on partial matching of `legend`.
barplot(pivot_table$Average.MSRP,
        main = "Average MSRP across different categories",
        xlab = "Country of Origin", ylab = "Average MSRP",
        col = rainbow(nrow(pivot_table)),
        legend.text = pivot_table$Country.of.Origin)