R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown, see <http://rmarkdown.rstudio.com>.

When you click the Knit button, a document is generated that includes both the content and the output of any embedded R code chunks. You can embed an R code chunk like this:

library(mlbench)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# 1. PREPARE DATASET ----

# Read the raw CSV; text columns are loaded as factors.
AirQualityDataset <- read.csv(
  "~/Downloads/Air_Quality(in).csv",
  stringsAsFactors = TRUE
)
# Keep an untouched copy so the original columns remain available after the
# working data frame is cleaned below.
AirQualityDataset_Not_Altered <- AirQualityDataset
# View() opens a GUI data viewer, so it is only called in an interactive
# session and skipped when the script runs non-interactively.
if (interactive()) {
  View(AirQualityDataset_Not_Altered)
}
head(AirQualityDataset)
##   Unique.ID Indicator.ID                   Name Measure Measure.Info
## 1    336867          375 Nitrogen dioxide (NO2)    Mean          ppb
## 2    336741          375 Nitrogen dioxide (NO2)    Mean          ppb
## 3    550157          375 Nitrogen dioxide (NO2)    Mean          ppb
## 4    412802          375 Nitrogen dioxide (NO2)    Mean          ppb
## 5    412803          375 Nitrogen dioxide (NO2)    Mean          ppb
## 6    412676          375 Nitrogen dioxide (NO2)    Mean          ppb
##   Geo.Type.Name Geo.Join.ID                    Geo.Place.Name
## 1            CD         407     Flushing and Whitestone (CD7)
## 2            CD         107             Upper West Side (CD7)
## 3            CD         414 Rockaway and Broad Channel (CD14)
## 4            CD         407     Flushing and Whitestone (CD7)
## 5            CD         407     Flushing and Whitestone (CD7)
## 6            CD         107             Upper West Side (CD7)
##           Time.Period Start_Date Data.Value Message
## 1      Winter 2014-15  12/1/2014      23.97      NA
## 2      Winter 2014-15  12/1/2014      27.42      NA
## 3 Annual Average 2017   1/1/2017      12.55      NA
## 4      Winter 2015-16  12/1/2015      22.63      NA
## 5         Summer 2016   6/1/2016      14.00      NA
## 6      Winter 2015-16  12/1/2015      26.43      NA
#summary(AirQualityDataset)
# Print the compact structure (column types and leading values) of the raw
# data frame before any columns are removed.
str(AirQualityDataset)
## 'data.frame':    18862 obs. of  12 variables:
##  $ Unique.ID     : int  336867 336741 550157 412802 412803 412676 412677 603044 412804 825832 ...
##  $ Indicator.ID  : int  375 375 375 375 375 375 375 375 375 375 ...
##  $ Name          : Factor w/ 18 levels "Annual vehicle miles traveled",..: 14 14 14 14 14 14 14 14 14 14 ...
##  $ Measure       : Factor w/ 8 levels "Annual average concentration",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ Measure.Info  : Factor w/ 8 levels "µg/m3","mcg/m3",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ Geo.Type.Name : Factor w/ 5 levels "Borough","CD",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Geo.Join.ID   : int  407 107 414 407 407 107 107 314 407 107 ...
##  $ Geo.Place.Name: Factor w/ 114 levels "Bay Ridge and Dyker Heights (CD10)",..: 38 107 82 38 38 107 107 35 38 107 ...
##  $ Time.Period   : Factor w/ 57 levels "2005","2005-2007",..: 49 49 21 50 35 50 35 22 20 56 ...
##  $ Start_Date    : Factor w/ 46 levels "1/1/2005","1/1/2009",..: 22 22 8 23 39 23 39 9 31 29 ...
##  $ Data.Value    : num  24 27.4 12.6 22.6 14 ...
##  $ Message       : logi  NA NA NA NA NA NA ...
# Remove the columns listed below by name; every other column is kept.
cols_to_drop <- c(
  "Unique.ID", "Indicator.ID", "Geo.Place.Name",
  "Geo.Join.ID", "Start_Date", "Message"
)
is_kept <- !(names(AirQualityDataset) %in% cols_to_drop)
AirQualityDataset <- AirQualityDataset[, is_kept]

# Count the missing values left in the remaining columns.
sum(is.na(AirQualityDataset))
## [1] 0
# Drop any row that still contains a missing value.
AirQualityDataset <- na.omit(AirQualityDataset)



# Make sure each of these columns is stored as a factor, converting them all
# in one pass instead of one assignment per column.
categorical_cols <- c(
  "Name", "Measure", "Measure.Info", "Geo.Type.Name", "Time.Period"
)
AirQualityDataset[categorical_cols] <- lapply(
  AirQualityDataset[categorical_cols],
  as.factor
)
# Show the structure of the cleaned data frame.
str(AirQualityDataset)
## 'data.frame':    18862 obs. of  6 variables:
##  $ Name         : Factor w/ 18 levels "Annual vehicle miles traveled",..: 14 14 14 14 14 14 14 14 14 14 ...
##  $ Measure      : Factor w/ 8 levels "Annual average concentration",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ Measure.Info : Factor w/ 8 levels "µg/m3","mcg/m3",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ Geo.Type.Name: Factor w/ 5 levels "Borough","CD",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Time.Period  : Factor w/ 57 levels "2005","2005-2007",..: 49 49 21 50 35 50 35 22 20 56 ...
##  $ Data.Value   : num  24 27.4 12.6 22.6 14 ...
# List the distinct values present in the Measure.Info column.
unique(AirQualityDataset$Measure.Info)
## [1] ppb                  mcg/m3               number              
## [4] per 100,000 adults   per square mile      per 100,000 children
## [7] µg/m3               per 100,000         
## 8 Levels: µg/m3 mcg/m3 number per 100,000 ... ppb
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Fold two Measure.Info levels into the existing "mcg/m3" level.
# NOTE(review): "per 100,000" reads like a rate rather than a concentration
# unit - confirm that merging it into "mcg/m3" is intended.
AirQualityDataset$Measure.Info <- recode(
  AirQualityDataset$Measure.Info,
  "µg/m3" = "mcg/m3",
  "per 100,000" = "mcg/m3"
)

# Fix the RNG seed so the shuffle, and therefore the train/test split, is the
# same on every run.
set.seed(123)

# Shuffle the rows
shuffled_data <- AirQualityDataset[sample(nrow(AirQualityDataset)), ]
# Get split index (70%)
n_rows <- nrow(shuffled_data)
split_index <- floor(0.7 * n_rows)
# Split into training and test sets. seq_len() is used instead of 1:n and
# (n + 1):m so that a very small input yields an empty set rather than a
# reversed index sequence.
AirQualityDataset_Train <- shuffled_data[seq_len(split_index), ]
AirQualityDataset_Test <-
  shuffled_data[split_index + seq_len(n_rows - split_index), ]
cat("Training rows:", nrow(AirQualityDataset_Train), "\n")
## Training rows: 13203
cat("Testing rows:", nrow(AirQualityDataset_Test), "\n")
## Testing rows: 5659
str(AirQualityDataset_Train)
## 'data.frame':    13203 obs. of  6 variables:
##  $ Name         : Factor w/ 18 levels "Annual vehicle miles traveled",..: 13 13 14 16 14 14 17 14 14 9 ...
##  $ Measure      : Factor w/ 8 levels "Annual average concentration",..: 6 6 6 1 6 6 6 6 6 8 ...
##  $ Measure.Info : Factor w/ 6 levels "mcg/m3","number",..: 1 1 6 1 6 6 6 6 6 2 ...
##  $ Geo.Type.Name: Factor w/ 5 levels "Borough","CD",..: 5 5 5 5 5 4 5 2 2 5 ...
##  $ Time.Period  : Factor w/ 57 levels "2005","2005-2007",..: 32 52 17 8 43 24 30 31 53 7 ...
##  $ Data.Value   : num  10.17 7.57 14.69 1.57 30.17 ...
# View() opens a GUI data viewer, so only call it in an interactive session.
if (interactive()) {
  View(AirQualityDataset_Train)
}

# 2. SVM FOR REGRESSION MODEL ----

# Resampling scheme: 10-fold cross-validation repeated 3 times.
# Alternative tried earlier: trainControl(method = 'cv', number = 10)
# NOTE: this object shares its name with caret's trainControl() function. The
# call below still resolves to the function because R skips non-function
# objects when looking up a call; the name is kept unchanged so existing code
# that passes it as `trControl` keeps working.
trainControl <- trainControl(method = 'repeatedcv', number = 10, repeats = 3)

library(e1071)

# Use a smaller training set.
# The draw is seeded so the same rows are selected on every run, and the size
# is capped at the number of available rows so sample() cannot fail when the
# training set has fewer than 2500 rows.
set.seed(123)
sample_size <- min(2500, nrow(AirQualityDataset_Train))
sample_train <- AirQualityDataset_Train[
  sample(nrow(AirQualityDataset_Train), sample_size),
]
#bigger difference between 5000-10000 than 10000-15000

# Fit both kernels on the same subsample. The seed is reset before each call
# so both fits start from the same RNG state.
set.seed(123)
fit.svm <- train(
  Data.Value ~ .,
  data = sample_train,
  method = 'svmRadial',
  trControl = trainControl
)

set.seed(123)
fit.svm1 <- train(
  Data.Value ~ .,
  data = sample_train,
  method = 'svmLinear',
  trControl = trainControl
)


# Print the resampling results of the radial-kernel fit.
print(fit.svm)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 2500 samples
##    5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2250, 2250, 2250, 2250, 2249, 2252, ... 
## Resampling results across tuning parameters:
## 
##   C     RMSE      Rsquared   MAE     
##   0.25  14.92878  0.5883150  5.558150
##   0.50  14.30274  0.6062455  5.278882
##   1.00  14.15867  0.6033759  5.179564
## 
## Tuning parameter 'sigma' was held constant at a value of 0.006345187
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.006345187 and C = 1.
# Print the resampling results of the linear-kernel fit.
print(fit.svm1)
## Support Vector Machines with Linear Kernel 
## 
## 2500 samples
##    5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2250, 2250, 2250, 2250, 2249, 2252, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   13.89192  0.6149383  5.529905
## 
## Tuning parameter 'C' was held constant at a value of 1
# Visualization: SVM radial

# Predict on the held-out test set with the radial-kernel model.
predictions <- predict(fit.svm, newdata = AirQualityDataset_Test)
# Actual vs Predicted Plot
plot(AirQualityDataset_Test$Data.Value, predictions,
     main = "Actual vs Predicted (SVM Radial)",
     xlab = "Actual Values",
     ylab = "Predicted Values",
     col = "blue", pch = 19)
abline(0, 1, col = "red")  # ideal line

# Calculate residuals
residuals <- AirQualityDataset_Test$Data.Value - predictions
# Residuals vs Predicted Plot
plot(predictions, residuals,
     main = "Residuals vs Predicted",
     xlab = "Predicted",
     ylab = "Residuals",
     col = "darkgreen", pch = 19)
abline(h = 0, col = "red")

# Distribution of the residuals.
hist(residuals, main = "Histogram of Residuals", col = "purple", xlab = "Residuals")

# Visualization: SVM linear

# Predict on the held-out test set with the linear-kernel model.
predictions1 <- predict(fit.svm1, newdata = AirQualityDataset_Test)
# Actual vs Predicted Plot
plot(AirQualityDataset_Test$Data.Value, predictions1,
     main = "Actual vs Predicted (SVM Linear)",
     xlab = "Actual Values",
     ylab = "Predicted Values",
     col = "blue", pch = 19)
abline(0, 1, col = "red")  # ideal line

# Calculate residuals
residuals1 <- AirQualityDataset_Test$Data.Value - predictions1
# Residuals vs Predicted Plot
plot(predictions1, residuals1,
     main = "Residuals vs Predicted",
     xlab = "Predicted",
     ylab = "Residuals",
     col = "darkgreen", pch = 19)
abline(h = 0, col = "red")

# Distribution of the residuals.
hist(residuals1, main = "Histogram of Residuals", col = "purple", xlab = "Residuals")

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.