Assn 1: Introduction to Data Analytics

R Markdown

library(ISLR)

## Warning: package 'ISLR' was built under R version 4.3.2

library(ggplot2)

data(Auto)

# Report whether the Auto data frame contains any missing values.
# The two messages are mutually exclusive, so they must be joined by `else`;
# without it the second braced block runs unconditionally and
# "No missing values" is printed even when NAs are present.
Auto1 <- any(is.na(Auto))
if (Auto1) {
  cat("there are missing values")
} else {
  cat("No missing values")
}

## No missing values

Q9 a. Quantitative and Qualitative predictors

# Show the structure of Auto: one line per column with its storage type,
# used below to separate numeric columns from the factor column.
str(Auto)

## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num  3504 3693 3436 3433 3449 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : num  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...

# Q9 a: classify the predictors.
# The message is assembled with paste() so that no line break ends up inside
# the string literal (a break inside the quotes is printed as "\n").
# `origin` is stored as a number, but its values are region codes
# (1 = American, 2 = European, 3 = Japanese), so it is qualitative.
quantitative_msg <- paste(
  "The following are quantitative predictors:",
  "mpg, cylinders, displacement, horsepower, weight, acceleration, year"
)
print(quantitative_msg)

## [1] "The following are quantitative predictors: mpg, cylinders, displacement, horsepower, weight, acceleration, year"

qualitative_msg <- "The following are qualitative predictors: origin (numeric code), name"
print(qualitative_msg)

## [1] "The following are qualitative predictors: origin (numeric code), name"

Q9 b. Range of each quantitative predictor

# Flag the numeric columns of Auto. vapply() is used instead of sapply() so
# the result is always a logical vector, whatever the input looks like.
quantitative_predictors <- vapply(Auto, is.numeric, logical(1))
# Getting the range for each quantitative predictor: the result is a matrix
# with one column per predictor (row 1 = minimum, row 2 = maximum).
# drop = FALSE keeps a data frame even if only one column is numeric.
quantitative_ranges <- vapply(
  Auto[, quantitative_predictors, drop = FALSE],
  range,
  numeric(2)
)
# Printing the range for each quantitative predictor
cat("Range of Quantitative Predictors:\n")
print(quantitative_ranges)

Q9 c.Mean and standard deviation of each quantitative predictor

# Q9 c: print the mean and standard deviation of every numeric predictor,
# one predictor per line.
# A single loop replaces the eight hand-written cat() calls, several of which
# had been broken across lines in the middle of a string or a column name
# (`Auto$h` / `orsepower` does not parse) and none of which ended the line.
numeric_predictors <- names(Auto)[vapply(Auto, is.numeric, logical(1))]
for (predictor in numeric_predictors) {
  values <- Auto[[predictor]]
  cat(
    "Mean of", predictor, "and Std. Deviation of", predictor, "are",
    mean(values), "and", sd(values), "\n"
  )
}

Q9 d.Now remove the 10th through 85th observations. What is the range, mean, and standard deviation of each predictor in the subset of the data that remains?

# Q9 d: drop observations 10 through 85 (negative indices remove rows),
# then report range, mean and standard deviation for each predictor in the
# rows that remain.
subset_auto <- Auto[-(10:85), ]
# Predictors reported for the subset, in output order.
subset_predictors <- c(
  "mpg", "cylinders", "displacement", "horsepower",
  "weight", "acceleration", "year"
)
# One loop replaces the per-predictor copies of range()/mean()/sd(); each
# line prints: minimum, maximum, mean, standard deviation.
for (predictor in subset_predictors) {
  values <- subset_auto[[predictor]]
  cat(
    "Range, Mean, and Standard Deviation for", predictor, "in the subset:",
    range(values), mean(values), sd(values), "\n"
  )
}

## Range, Mean, and Standard Deviation for mpg in the subset: 11 46.6 24.40443 7.867283
## Range, Mean, and Standard Deviation for cylinders in the subset: 3 8 5.373418 1.654179
## Range, Mean, and Standard Deviation for displacement in the subset: 68 455 187.2405 99.67837
## Range, Mean, and Standard Deviation for horsepower in the subset: 46 230 100.7215 35.70885
## Range, Mean, and Standard Deviation for weight in the subset: 1649 4997 2935.972 811.3002
## Range, Mean, and Standard Deviation for acceleration in the subset: 8.5 24.8 15.7269 2.693721
## Range, Mean, and Standard Deviation for year in the subset: 70 82 77.14557 3.106217

Q9 e.Using the full data set, investigate the predictors graphically, using scatterplots or other tools of your choice. Create some plots highlighting the relationships among the predictors. Comment on your findings.

# MPG plotted against vehicle weight.
mpg_by_weight <- ggplot(data = Auto, mapping = aes(x = weight, y = mpg)) +
  geom_point() +
  labs(title = "Scatterplot of MPG vs. Weight", x = "Weight", y = "MPG") +
  theme_classic()
print(mpg_by_weight)

# Acceleration plotted against horsepower.
acceleration_by_horsepower <- ggplot(
  data = Auto,
  mapping = aes(x = horsepower, y = acceleration)
) +
  geom_point() +
  labs(
    title = "Scatterplot of Horsepower vs. Acceleration",
    x = "Horsepower",
    y = "Acceleration"
  ) +
  theme_classic()
print(acceleration_by_horsepower)

# MPG plotted against horsepower, one panel per cylinder count
# (a faceted scatterplot rather than a true scatterplot matrix).
mpg_by_horsepower <- ggplot(
  data = Auto,
  mapping = aes(x = horsepower, y = mpg)
) +
  geom_point() +
  facet_wrap(~cylinders) +
  labs(
    title = "Scatterplot Matrix of MPG vs. Horsepower (Faceted by Cylinders)",
    x = "Horsepower",
    y = "MPG"
  ) +
  theme_classic()
print(mpg_by_horsepower)

Main trend here: there is a negative correlation between weight and mileage (mpg).

Q9 f. Suppose that we wish to predict gas mileage (mpg) on the basis of the other variables. Do your plots suggest that any of the other variables might be useful in predicting mpg? Justify your answer.

When evaluating the potential predictive value of various variables in relation to gas mileage (MPG), an analysis of scatterplots reveals certain observations:

In the context of MPG vs. Weight, a discernible negative correlation is evident. Lighter vehicles generally exhibit higher MPG, suggesting that weight could be a valuable predictor of fuel efficiency.

Turning attention to the relationship between Horsepower and Acceleration, no clear pattern or trend emerges. This lack of a distinct association implies that acceleration may not serve as a robust predictor of MPG.

A more nuanced perspective is gained when examining MPG vs. Horsepower, faceted by Cylinders. In certain cylinder categories, a negative correlation between horsepower and MPG is observed. The faceted scatterplot shows how this relationship varies across the different cylinder counts.

Considering the interaction with the cylinders variable, Horsepower emerges as a potentially valuable predictor of MPG. The negative relationship between horsepower and MPG, especially within specific cylinder categories, suggests its potential utility in predicting fuel efficiency.

Q10

library(ISLR2)

## Warning: package 'ISLR2' was built under R version 4.3.2

## 
## Attaching package: 'ISLR2'

## The following object is masked _by_ '.GlobalEnv':
## 
##     Auto

## The following objects are masked from 'package:ISLR':
## 
##     Auto, Credit

Q10 a.Number of rows and columns of boston dataset?

# Q10 a asks for the number of rows and columns: dim() returns
# c(rows, columns). summary() does not report the dimensions; it is kept
# because the per-predictor ranges are referred to in later answers.
dim(Boston)

## [1] 506  13

summary(Boston)

##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          lstat      
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   : 1.73  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.: 6.95  
##  Median : 5.000   Median :330.0   Median :19.05   Median :11.36  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :12.65  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:16.95  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :37.97  
##       medv      
##  Min.   : 5.00  
##  1st Qu.:17.02  
##  Median :21.20  
##  Mean   :22.53  
##  3rd Qu.:25.00  
##  Max.   :50.00

Q10 b. Make some pairwise scatterplots of the predictors (columns) in this data set. Describe your findings.

# Draw a scatterplot matrix: one panel for every pair of Boston columns.
pairs(Boston)

We can observe that several pairs of variables are correlated with each other.

Q10 c. Are any of the predictors associated with per capita crime rate? If so, explain the relationship.

# Q10 c: Pearson correlation test of `crim` against every other Boston column.
# The original looped over every pair of columns and discarded all pairs
# whose first member was not "crim", growing the data frame with rbind() on
# each pass; here only the needed pairs are visited and the rows are bound
# once at the end.
other_predictors <- setdiff(colnames(Boston), "crim")
result_rows <- lapply(other_predictors, function(j) {
  correlation_test <- cor.test(Boston[["crim"]], Boston[[j]])
  # `estimate` is a named value ("cor"); that name becomes the row name,
  # which rbind() then makes unique (cor, cor1, cor2, ...).
  data.frame(
    i = "crim",
    j = j,
    cor = correlation_test$estimate,
    p = correlation_test$p.value
  )
})
result_df <- do.call(rbind, result_rows)
# Print the result data frame
print(result_df)

##          i       j         cor            p
## cor   crim      zn -0.20046922 5.506472e-06
## cor1  crim   indus  0.40658341 1.450349e-21
## cor2  crim    chas -0.05589158 2.094345e-01
## cor3  crim     nox  0.42097171 3.751739e-23
## cor4  crim      rm -0.21924670 6.346703e-07
## cor5  crim     age  0.35273425 2.854869e-16
## cor6  crim     dis -0.37967009 8.519949e-19
## cor7  crim     rad  0.62550515 2.693844e-56
## cor8  crim     tax  0.58276431 2.357127e-47
## cor9  crim ptratio  0.28994558 2.942922e-11
## cor10 crim   lstat  0.45562148 2.654277e-27
## cor11 crim    medv -0.38830461 1.173987e-19

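Yes. Per capita crime rate is positively correlated with rad (0.63), tax (0.58), lstat (0.46), nox (0.42), indus (0.41), age (0.35) and ptratio (0.29), and negatively correlated with medv (-0.39), dis (-0.38), rm (-0.22) and zn (-0.20). All of these correlations are statistically significant (p < 0.001). Only chas shows no significant association with crime rate (cor = -0.06, p = 0.21).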

Q10 d. Do any of the census tracts of Boston appear to have particularly high crime rates? Tax rates? Pupil-teacher ratios? Comment on the range of each predictor.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Create boxplots for selected predictors
selected_predictors <- c("crim", "tax", "ptratio")
# y-axis label for each selected predictor; the plot title is built from it
# ("Boxplot of <label>s").
predictor_labels <- c(
  crim = "Crime Rate",
  tax = "Tax Rate",
  ptratio = "Pupil-Teacher Ratio"
)
# `selected_predictors` was previously defined but never used; it now drives
# a single loop instead of three copy-pasted plot definitions.
for (predictor in selected_predictors) {
  axis_label <- predictor_labels[[predictor]]
  boxplot_for_predictor <- ggplot(
    Boston,
    aes(x = 1, y = .data[[predictor]])  # .data[[...]] selects a column by name
  ) +
    geom_boxplot() +
    labs(
      title = paste0("Boxplot of ", axis_label, "s"),
      x = "",
      y = axis_label
    )
  # Explicit print(): ggplot objects are not auto-printed inside a loop.
  print(boxplot_for_predictor)
}

1. The boxplot shows that most census tracts have low crime rates (median 0.26, third quartile 3.68), but a number of tracts lie far above the rest of the distribution, with a maximum of 88.98.

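2. Tax rates range from 187 to 711 with a median of 330. The third quartile is 666, so a sizeable group of tracts is taxed at a much higher rate than the rest; because the interquartile range is so wide (279 to 666), no individual tract falls outside the whiskers as an outlier.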

3. Pupil-teacher ratios cover a comparatively narrow range, from 12.6 to 22.0, with a median of 19.05. Half of the tracts lie between 17.4 and 20.2, and the maximum of 22.0 is well inside the upper whisker limit (20.2 + 1.5 × 2.8 = 24.4), so no tract has an unusually high pupil-teacher ratio; the only tracts that stand apart are a few at the low end of the range.

Q10 e. How many of the census tracts in this data set bound the Charles river?

# Count the rows where chas equals 1 (comparison gives TRUE/FALSE, and sum()
# counts the TRUEs). Per Q10 e, these are the tracts that bound the river.
sum(Boston$chas == 1)

## [1] 35

Q10 f. What is the median pupil-teacher ratio among the towns in this data set?

# Print the median of the ptratio column, prefixed with a label.
cat("median pupil-teacher ratio among the towns :", median(Boston$ptratio))

## median pupil-teacher ratio among the towns : 19.05

Q10 g. Which census tract of Boston has lowest median value of owneroccupied homes? What are the values of the other predictors for that census tract, and how do those values compare to the overall ranges for those predictors? Comment on your findings.

# Sort the tracts by medv in ascending order; `selection` still holds every
# row of Boston, only re-ordered.
selection <- Boston[order(Boston$medv), ]
# Show every tract tied for the lowest medv. Taking only the first sorted
# row would silently hide ties. Comparing with == is exact here because
# min() returns one of the stored values unchanged.
selection[selection$medv == min(selection$medv), ]

##        crim zn indus chas   nox    rm age    dis rad tax ptratio lstat medv
## 399 38.3518  0  18.1    0 0.693 5.453 100 1.4896  24 666    20.2 30.59    5
## 406 67.9208  0  18.1    0 0.693 5.683 100 1.4254  24 666    20.2 22.98    5

Census tract 399 has the lowest median value of owner-occupied homes (medv = 5).

# `selection` is the whole Boston data frame sorted by medv, so this prints
# the overall per-predictor summary against which the tract is compared.
summary(selection)

##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          lstat      
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   : 1.73  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.: 6.95  
##  Median : 5.000   Median :330.0   Median :19.05   Median :11.36  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :12.65  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:16.95  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :37.97  
##       medv      
##  Min.   : 5.00  
##  1st Qu.:17.02  
##  Median :21.20  
##  Mean   :22.53  
##  3rd Qu.:25.00  
##  Max.   :50.00

Neighborhood #399 exhibits an exceptionally elevated crime rate in stark contrast to the median and average rates across all Boston neighborhoods. Additionally, over 50% of Boston neighborhoods share the characteristic of lacking residential land zoned for lots exceeding 25,000 sq.ft. In comparison to the majority of suburbs, this area stands out for its notably high proportion of non-retail business acres per town. It’s worth noting that this suburb does not fall within the category of those bordering the Charles River. Furthermore, the concentration of nitrogen oxides, measured in parts per 10 million, ranks among the highest in this particular neighborhood.

Q10 h. In this data set, how many of the census tracts average more than seven rooms per dwelling? More than eight rooms per dwelling? Comment on the census tracts that average more than eight rooms per dwelling.

# Keep the tracts whose average number of rooms per dwelling exceeds seven,
# then count them.
has_over_7_rooms <- Boston$rm > 7
rm_over_7 <- Boston[which(has_over_7_rooms), ]
nrow(rm_over_7)

## [1] 64

There are 64 suburbs having more than 7 rooms per dwelling!

# Keep the tracts whose average number of rooms per dwelling exceeds eight,
# then count them.
has_over_8_rooms <- Boston$rm > 8
rm_over_8 <- Boston[which(has_over_8_rooms), ]
nrow(rm_over_8)

## [1] 13

There are 13 suburbs having more than 8 rooms per dwelling

# Per-predictor summary restricted to the tracts in rm_over_8, for
# comparison with the summary of the full data set.
summary(rm_over_8)

##       crim               zn            indus             chas       
##  Min.   :0.02009   Min.   : 0.00   Min.   : 2.680   Min.   :0.0000  
##  1st Qu.:0.33147   1st Qu.: 0.00   1st Qu.: 3.970   1st Qu.:0.0000  
##  Median :0.52014   Median : 0.00   Median : 6.200   Median :0.0000  
##  Mean   :0.71879   Mean   :13.62   Mean   : 7.078   Mean   :0.1538  
##  3rd Qu.:0.57834   3rd Qu.:20.00   3rd Qu.: 6.200   3rd Qu.:0.0000  
##  Max.   :3.47428   Max.   :95.00   Max.   :19.580   Max.   :1.0000  
##       nox               rm             age             dis       
##  Min.   :0.4161   Min.   :8.034   Min.   : 8.40   Min.   :1.801  
##  1st Qu.:0.5040   1st Qu.:8.247   1st Qu.:70.40   1st Qu.:2.288  
##  Median :0.5070   Median :8.297   Median :78.30   Median :2.894  
##  Mean   :0.5392   Mean   :8.349   Mean   :71.54   Mean   :3.430  
##  3rd Qu.:0.6050   3rd Qu.:8.398   3rd Qu.:86.50   3rd Qu.:3.652  
##  Max.   :0.7180   Max.   :8.780   Max.   :93.90   Max.   :8.907  
##       rad              tax           ptratio          lstat           medv     
##  Min.   : 2.000   Min.   :224.0   Min.   :13.00   Min.   :2.47   Min.   :21.9  
##  1st Qu.: 5.000   1st Qu.:264.0   1st Qu.:14.70   1st Qu.:3.32   1st Qu.:41.7  
##  Median : 7.000   Median :307.0   Median :17.40   Median :4.14   Median :48.3  
##  Mean   : 7.462   Mean   :325.1   Mean   :16.36   Mean   :4.31   Mean   :44.2  
##  3rd Qu.: 8.000   3rd Qu.:307.0   3rd Qu.:17.40   3rd Qu.:5.12   3rd Qu.:50.0  
##  Max.   :24.000   Max.   :666.0   Max.   :20.20   Max.   :7.44   Max.   :50.0

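Census tracts with an average of more than eight rooms per dwelling are typically more affluent (median medv of 48.3 versus 21.2 overall; median lstat of 4.14 versus 11.36 overall), have older housing stock (median age of 78.3; note that age describes the dwellings, not the residents), and have larger residences. These tracts also have lower crime (mean crim of 0.72 versus 3.61 overall) and a smaller share of industrial land (mean indus of 7.08 versus 11.14 overall).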

Assn 1: Introduction to Data Analytics

Magesh G

2024-01-26

R Markdown