#Exploring and Understanding Data with R
#Data exploration example using used car data
getwd()
[1] "/cloud/project"
# Load the CSV file
usedcars <- read.csv("usedcars.csv", stringsAsFactors = FALSE)
# get structure of used car data
str(usedcars)
'data.frame':   150 obs. of  6 variables:
 $ year        : int  2011 2011 2011 2011 2012 2010 2011 2010 2011 2010 ...
 $ model       : chr  "SEL" "SEL" "SEL" "SEL" ...
 $ price       : int  21992 20995 19995 17809 17500 17495 17000 16995 16995 16995 ...
 $ mileage     : int  7413 10926 7351 11613 8367 25125 27393 21026 32655 36116 ...
 $ color       : chr  "Yellow" "Gray" "Silver" "Gray" ...
 $ transmission: chr  "AUTO" "AUTO" "AUTO" "AUTO" ...
#Exploring numeric variables
# summarize numeric variables
summary(usedcars$year)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   2000    2008    2009    2009    2010    2012 
summary(usedcars[c("price", "mileage")])
     price          mileage      
 Min.   : 3800   Min.   :  4867  
 1st Qu.:10995   1st Qu.: 27200  
 Median :13592   Median : 36385  
 Mean   :12962   Mean   : 44261  
 3rd Qu.:14904   3rd Qu.: 55124  
 Max.   :21992   Max.   :151479  
# calculate the mean income
(36000 + 44000 + 56000) / 3
[1] 45333.33
mean(c(36000, 44000, 56000))
[1] 45333.33
# the median income
median(c(36000, 44000, 56000))
[1] 44000
# the min/max of used car prices
range(usedcars$price)
[1]  3800 21992
# the difference of the range
diff(range(usedcars$price))
[1] 18192
# IQR for used car prices
IQR(usedcars$price)
[1] 3909.5
# use quantile to calculate five-number summary
quantile(usedcars$price)
     0%     25%     50%     75%    100% 
 3800.0 10995.0 13591.5 14904.5 21992.0 
# the 1st and 99th percentiles
quantile(usedcars$price, probs = c(0.01, 0.99))
      1%      99% 
 5428.69 20505.00 
# quintiles
quantile(usedcars$price, seq(from = 0, to = 1, by = 0.20))
     0%     20%     40%     60%     80%    100% 
 3800.0 10759.4 12993.8 13992.0 14999.0 21992.0 
# boxplot of used car prices and mileage
boxplot(usedcars$price, main="Boxplot of Used Car Prices",
      ylab="Price ($)")

# This boxplot visually represents the distribution of used car prices in the dataset, highlighting key statistical features and potential outliers. The rectangular box illustrates the interquartile range (IQR), which contains the central/middle 50% of the data, while the line inside the box marks the median price. The whiskers extend to the most extreme values that lie within 1.5 times the IQR beyond the edges of the box (the first and third quartiles), and any points beyond this range are considered outliers, appearing as individual dots. These outliers indicate cars priced significantly above or below the typical range, possibly due to differences in factors such as mileage, brand, model year, or condition. By analyzing this distribution, we can better understand pricing patterns and identify unusual price variations within the dataset.
boxplot(usedcars$mileage, main="Boxplot of Used Car Mileage",
      ylab="Odometer (mi.)")

# Here the boxplot displays the distribution of used car mileage, providing insight into how mileage varies among the vehicles in the dataset. The box represents the interquartile range (IQR), which contains the middle 50% of the data, while the line inside the box indicates the median mileage. The whiskers extend to the most extreme values that lie within 1.5 times the IQR beyond the edges of the box (the first and third quartiles), and any data points beyond this range are considered outliers, appearing as individual dots. These outliers suggest that some vehicles have significantly higher or lower mileage than the majority, which could be due to differences in usage, age, or maintenance history. A right-skewed distribution, if present, would indicate that most used cars have moderate mileage, with a few having exceptionally high mileage. Understanding this distribution can help assess the typical wear and tear on vehicles and identify any anomalies in the dataset.
# histograms of used car prices and mileage
hist(usedcars$price, main = "Histogram of Used Car Prices",
     xlab = "Price ($)")

#This histogram illustrates the distribution of used car prices, showing how prices are spread across different ranges. The x-axis represents price values grouped into bins, while the y-axis indicates the number of cars within each price range. This visualization helps identify trends such as the central tendency, where most prices are concentrated, and the overall spread of values. If the histogram is right-skewed, it suggests that a majority of used cars are priced lower, with fewer high-priced vehicles pulling the distribution to the right. Additionally, gaps or clusters in certain price ranges may indicate pricing patterns influenced by factors such as vehicle make, model, or condition. By analyzing this distribution, we can gain a better understanding of market trends and price variability in the dataset.
hist(usedcars$mileage, main = "Histogram of Used Car Mileage",
     xlab = "Odometer (mi.)")

#This histogram represents the distribution of used car mileage, illustrating how odometer readings are spread across different ranges. The x-axis groups mileage values into bins, while the y-axis shows the number of cars within each range. This visualization helps identify patterns such as central tendency and spread, revealing whether most cars have low, moderate, or high mileage. Since the histogram is right-skewed, it indicates that the majority of vehicles have relatively lower mileage, with fewer high-mileage cars extending the distribution to the right.
# variance and standard deviation of the used car data
var(usedcars$price)
[1] 9749892
sd(usedcars$price)
[1] 3122.482
var(usedcars$mileage)
[1] 728033954
sd(usedcars$mileage)
[1] 26982.1
#Exploring categorical variables
# one-way tables for the used car data
table(usedcars$year)

2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 
   3    1    1    1    3    2    6   11   14   42   49   16    1 
table(usedcars$model)

 SE SEL SES 
 78  23  49 
table(usedcars$color)

 Black   Blue   Gold   Gray  Green    Red Silver  White Yellow 
    35     17      1     16      5     25     32     16      3 
# compute table proportions
model_table <- table(usedcars$model)
prop.table(model_table)

       SE       SEL       SES 
0.5200000 0.1533333 0.3266667 
# round the data
color_table <- table(usedcars$color)
color_pct <- prop.table(color_table) * 100
round(color_pct, digits = 1)

 Black   Blue   Gold   Gray  Green    Red Silver  White Yellow 
  23.3   11.3    0.7   10.7    3.3   16.7   21.3   10.7    2.0 
#Exploring relationships between variables 
# scatterplot of price vs. mileage
plot(x = usedcars$mileage, y = usedcars$price,
     main = "Scatterplot of Price vs. Mileage",
     xlab = "Used Car Odometer (mi.)",
     ylab = "Used Car Price ($)")

#This scatterplot visualizes the relationship between used car mileage and price, with each point representing an individual car. The x-axis displays odometer readings in miles, while the y-axis represents prices in dollars. The plot reveals a negative correlation, suggesting that as mileage increases, the price of the vehicle generally decreases. This trend aligns with expectations, as higher mileage often indicates more wear and tear, reducing a car’s market value. However, the spread of points may also indicate variability due to other factors such as brand, model, condition, and maintenance history. Outliers, if present, could represent rare cases where high-mileage cars are still priced high due to premium features or exceptional upkeep. Understanding this relationship helps in assessing how mileage impacts used car pricing trends.
# new variable indicating conservative colors
usedcars$conservative <-
  usedcars$color %in% c("Black", "Gray", "Silver", "White")
# checking our variable
table(usedcars$conservative)

FALSE  TRUE 
   51    99 
install.packages("gmodels")
Installing package into ‘/cloud/lib/x86_64-pc-linux-gnu-library/4.4’
(as ‘lib’ is unspecified)
trying URL 'http://rspm/default/__linux__/focal/latest/src/contrib/gmodels_2.19.1.tar.gz'
Content type 'application/x-gzip' length 115780 bytes (113 KB)
==================================================
downloaded 113 KB

* installing *binary* package ‘gmodels’ ...
* DONE (gmodels)

The downloaded source packages are in
    ‘/tmp/RtmpysNK1H/downloaded_packages’
# Crosstab of conservative by model
library(gmodels)
CrossTable(x = usedcars$model, y = usedcars$conservative)

 
   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  150 

 
               | usedcars$conservative 
usedcars$model |     FALSE |      TRUE | Row Total | 
---------------|-----------|-----------|-----------|
            SE |        27 |        51 |        78 | 
               |     0.009 |     0.004 |           | 
               |     0.346 |     0.654 |     0.520 | 
               |     0.529 |     0.515 |           | 
               |     0.180 |     0.340 |           | 
---------------|-----------|-----------|-----------|
           SEL |         7 |        16 |        23 | 
               |     0.086 |     0.044 |           | 
               |     0.304 |     0.696 |     0.153 | 
               |     0.137 |     0.162 |           | 
               |     0.047 |     0.107 |           | 
---------------|-----------|-----------|-----------|
           SES |        17 |        32 |        49 | 
               |     0.007 |     0.004 |           | 
               |     0.347 |     0.653 |     0.327 | 
               |     0.333 |     0.323 |           | 
               |     0.113 |     0.213 |           | 
---------------|-----------|-----------|-----------|
  Column Total |        51 |        99 |       150 | 
               |     0.340 |     0.660 |           | 
---------------|-----------|-----------|-----------|

 
---
title: "Exploring and Understanding Data with R"
output: html_notebook
---

```{r}
# Exploring and Understanding Data with R
# Data exploration example using used car data

# Show the working directory; the relative CSV path below resolves against it
getwd()

# Read the used car listings, keeping text columns as character vectors
# rather than converting them to factors
usedcars <- read.csv(file = "usedcars.csv", stringsAsFactors = FALSE)

# Print each column's name, type, and first few values
str(usedcars)
# Exploring numeric variables ----

# Min, quartiles, median, mean, and max of a single column
summary(usedcars[["year"]])
# The same summary for several columns at once, printed side by side
summary(usedcars[, c("price", "mileage")])

# Mean of three example incomes: first by hand, then with mean()
sum(36000, 44000, 56000) / 3
mean(x = c(36000, 44000, 56000))

# Median of the same three example incomes
median(x = c(36000, 44000, 56000))

# Smallest and largest used car price
range(usedcars[["price"]])
# Width of that range (largest price minus smallest price)
max(usedcars[["price"]]) - min(usedcars[["price"]])

# Interquartile range of used car prices
IQR(usedcars[["price"]])

# Five-number summary: with no probs given, quantile() reports the
# 0%, 25%, 50%, 75%, and 100% cut points
quantile(usedcars[["price"]])

# 1st and 99th percentiles
quantile(usedcars[["price"]], probs = c(0.01, 0.99))

# Quintiles (cut points every 20%)
quantile(usedcars[["price"]], probs = seq(from = 0, to = 1, by = 0.20))
# Boxplots of used car prices and mileage ----

# Price boxplot. The box spans the interquartile range (IQR), i.e. the middle
# 50% of prices, and the line inside the box marks the median price. Each
# whisker reaches the most extreme value lying within 1.5 times the IQR of the
# box edge; points beyond the whiskers are drawn individually as potential
# outliers. Such points are cars priced well above or below the typical range,
# possibly because of mileage, model year, or condition. The plot therefore
# summarizes the usual pricing pattern and highlights unusual prices.
boxplot(
  usedcars[["price"]],
  main = "Boxplot of Used Car Prices",
  ylab = "Price ($)"
)

# Mileage boxplot, read the same way: box = IQR (middle 50% of odometer
# readings), inner line = median, whiskers = most extreme values within
# 1.5 times the IQR of the box, individual points = potential outliers.
# Outliers here are cars with much higher or lower mileage than most, which
# could reflect differences in usage, age, or maintenance history. A
# right-skewed shape, if present, would mean most cars have moderate mileage
# while a few have exceptionally high mileage. This helps gauge typical wear
# and tear and spot anomalies in the dataset.
boxplot(
  usedcars[["mileage"]],
  main = "Boxplot of Used Car Mileage",
  ylab = "Odometer (mi.)"
)

# Histograms of used car prices and mileage ----

# Price histogram. The x-axis groups prices into bins and the y-axis counts
# the cars in each bin, showing where prices concentrate (central tendency)
# and how widely they spread. If the histogram is right-skewed, most cars are
# priced lower and a few high-priced cars stretch the right tail. Gaps or
# clusters in particular price ranges may reflect pricing patterns driven by
# factors such as model or condition.
hist(
  usedcars[["price"]],
  main = "Histogram of Used Car Prices",
  xlab = "Price ($)"
)

# Mileage histogram. The x-axis groups odometer readings into bins and the
# y-axis counts the cars in each bin, showing central tendency and spread and
# whether most cars have low, moderate, or high mileage. The author reads this
# histogram as right-skewed: most cars have relatively low mileage, with fewer
# high-mileage cars extending the distribution to the right.
hist(
  usedcars[["mileage"]],
  main = "Histogram of Used Car Mileage",
  xlab = "Odometer (mi.)"
)
# Spread of the numeric columns: variance and standard deviation ----
var(usedcars[["price"]])
sd(usedcars[["price"]])
var(usedcars[["mileage"]])
sd(usedcars[["mileage"]])

# Exploring categorical variables ----

# One-way frequency tables: number of cars per year, model, and color
table(usedcars[["year"]])
table(usedcars[["model"]])
table(usedcars[["color"]])

# Share of cars per model, as proportions of the whole table
model_table <- table(usedcars[["model"]])
prop.table(model_table)

# Share of cars per color, as percentages rounded to one decimal place
color_table <- table(usedcars[["color"]])
color_pct <- 100 * prop.table(color_table)
round(color_pct, digits = 1)
# Exploring relationships between variables ----

# Scatterplot of price vs. mileage: one point per car, odometer reading in
# miles on the x-axis and price in dollars on the y-axis.
# The author's reading of this plot is a negative relationship: as mileage
# increases, price generally decreases, consistent with higher mileage
# implying more wear and tear and a lower market value. The scatter around
# that trend may reflect other factors such as model, condition, and
# maintenance history. Outliers, if present, could be high-mileage cars that
# are still priced high because of premium features or exceptional upkeep.
plot(
  x = usedcars[["mileage"]],
  y = usedcars[["price"]],
  main = "Scatterplot of Price vs. Mileage",
  xlab = "Used Car Odometer (mi.)",
  ylab = "Used Car Price ($)"
)
# Flag cars painted in a conservative color (black, gray, silver, or white).
# %in% gives one TRUE/FALSE per row.
usedcars$conservative <-
  usedcars$color %in% c("Black", "Gray", "Silver", "White")

# Check the new variable: counts of non-conservative (FALSE) and
# conservative (TRUE) cars
table(usedcars$conservative)

# Install gmodels only when it is missing, so re-running or knitting the
# notebook does not re-download the package on every execution (and does not
# fail when no network or CRAN mirror is available but the package is present).
if (!requireNamespace("gmodels", quietly = TRUE)) {
  install.packages("gmodels")
}

# Crosstab of conservative by model: model in rows, conservative flag in columns
library(gmodels)
CrossTable(x = usedcars$model, y = usedcars$conservative)
```

