# Let go back and analyze the air quality dataset (if you remember, we used that previously, in the visualization lab). Remember to think about how to deal with the NAs in the data.
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
3: In readChar(file, size, TRUE) : truncating string with embedded nuls
4: In readChar(file, size, TRUE) : truncating string with embedded nuls
5: In readChar(file, size, TRUE) : truncating string with embedded nuls
6: In readChar(file, size, TRUE) : truncating string with embedded nuls
7: In readChar(file, size, TRUE) : truncating string with embedded nuls
8: In readChar(file, size, TRUE) : truncating string with embedded nuls
library(e1071)
library(ggplot2)
library(kernlab)
library(gridExtra)
airdata <- data.frame(airquality)
airdataNONA <- na.omit(airdata)
# Using techniques discussed in class, create two datasets – one for training and one for testing.
randIndex <- sample(1:nrow(airdataNONA))
nr <- nrow(airdataNONA)
cutpoint3_4 <-floor(3 * nr/4)
TestingAir <- airdataNONA[randIndex[1:cutpoint3_4],]
print(TestingAir)
# Build a model (using the ‘ksvm’ function, trying to predict onzone). You can use all the possible attributes, or select the attributes that you think would be the most helpful.
# Test the model on the testing dataset, and compute the Root Mean Squared Error 3) Plot the results. Use a scatter plot. Have the x-axis represent temperature, the y-axis represent wind, the point size and color represent the error, as defined by the actual ozone level minus the predicted ozone level).
# Compute models and plot the results for ‘svm’ (in the e1071 package) and ‘lm’. Generate similar charts for each model
# Show all three results (charts) in one window, using the grid.arrange function
rootSquare <- function(error)
{
sqrt(mean(error^2))
}
modelKSVM <- ksvm(Ozone ~ ., data = airdataNONA)
modelKSVM
Support Vector Machine object of class "ksvm"
SV type: eps-svr (regression)
parameter : epsilon = 0.1 cost C = 1
Gaussian Radial Basis kernel function.
Hyperparameter : sigma = 0.183415687283751
Number of Support Vectors : 90
Objective Function Value : -27.4819
Training error : 0.246222
ModelOzone <- function(X, airdataNONA){
ModelOzone <- predict(X, airdataNONA)
resultsFunc <- table(ModelOzone, airdataNONA$Ozone)
print(resultsFunc)
percentCorrect <- (resultsFunc[1,1]+resultsFunc[2,2])/(resultsFunc[1,1]+resultsFunc[1,2]+resultsFunc[2,1]+resultsFunc[2, 2])*100
round(percentCorrect)
return(percentCorrect)
}
modelKSVMPredict<- predict(modelKSVM, airdataNONA)
modelKSVMError <- (airdataNONA$Ozone - modelKSVMPredict)
rootSquare(modelKSVMError)
[1] 16.51178
ModelOzone(modelKSVM,airdataNONA)
ModelOzone 1 4 6 7 8 9 10 11 12 13 14 16 18 19 20 21 22 23 24 27 28 29 30 31 32 34 35 36 37 39 40 41 44 45 46 47 48 49 50 52 59 61 63 64 65 71
9.33932510270754 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
9.77124171502148 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10.3297558179981 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10.7722749915059 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11.3151240757621 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11.3352459941779 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11.4597889398027 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12.3378840653615 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12.3440502652265 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12.967893355747 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13.3365934942842 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13.444897211895 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13.920975349663 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
14.0557112943695 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
14.2386879995762 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
ModelOzone 73 76 77 78 79 80 82 84 85 89 91 96 97 108 110 115 118 122 135 168
9.33932510270754 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
9.77124171502148 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10.3297558179981 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10.7722749915059 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11.3151240757621 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11.3352459941779 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11.4597889398027 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12.3378840653615 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12.3440502652265 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12.967893355747 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13.3365934942842 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13.444897211895 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13.920975349663 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
14.0557112943695 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
14.2386879995762 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[ reached getOption("max.print") -- omitted 96 rows ]
[1] NaN
modelSVM <- svm(Ozone ~ ., data = airdataNONA)
ModelOzone(modelSVM, airdataNONA)
ModelOzone 1 4 6 7 8 9 10 11 12 13 14 16 18 19 20 21 22 23 24 27 28 29 30 31 32 34 35 36 37 39 40 41 44 45 46 47 48 49 50 52 59 61 63 64 65 71
9.31757225549279 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
9.73110516752558 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10.3302246381265 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10.6264934548199 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11.038041152856 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11.3248275352304 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11.4964482717066 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12.3333528447084 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12.334204054021 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12.5172892908793 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13.0499256012054 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13.1966953821272 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13.3304929160326 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13.4769768005589 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13.5227707952261 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
ModelOzone 73 76 77 78 79 80 82 84 85 89 91 96 97 108 110 115 118 122 135 168
9.31757225549279 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
9.73110516752558 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10.3302246381265 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10.6264934548199 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11.038041152856 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11.3248275352304 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11.4964482717066 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12.3333528447084 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12.334204054021 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12.5172892908793 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13.0499256012054 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13.1966953821272 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13.3304929160326 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13.4769768005589 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13.5227707952261 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[ reached getOption("max.print") -- omitted 96 rows ]
[1] NaN
modelSVMPredict <- predict(modelSVM, airdataNONA)
modelSVMError <- (airdataNONA$Ozone - modelSVMPredict)
rootSquare(modelSVMError)
[1] 16.39104
modelLM <- lm(Ozone ~., data=airdataNONA)
modelLMPredict <- predict(modelLM, airdataNONA)
modelLMError <- (airdataNONA$Ozone - modelLMPredict)
rootSquare(modelLMError)
[1] 20.28689
ModelOne <- ggplot(data = airdataNONA,aes(x=airdataNONA$Temp,y=airdataNONA$Wind)) + geom_point(shape = 1, color = "orange") + ggtitle("KSVM Model")
plot(ModelOne)
ModelTwo <- data.frame(airdataNONA$Wind,airdataNONA$Temp,modelSVMError)
colnames(ModelTwo) <- c("Wind","Temp","Error")
ModelTwoPlot <- ggplot(data = airdataNONA,aes(x=airdataNONA$Temp,y=airdataNONA$Wind)) + geom_point(aes(size=modelSVMError), color = "yellow") + ggtitle("SVM Model")
plot(ModelTwoPlot)
ModelThree <- data.frame(airdataNONA$Wind,airdataNONA$Temp,modelLMError)
colnames(ModelThree) <- c("Wind","Temp","Error")
ModelThreePlot <- ggplot(data = ModelThree,aes(x=airdataNONA$Temp,y=airdataNONA$Wind)) + geom_point(aes(size=modelLMError), color = "green") + ggtitle("LM Model")
plot(ModelThreePlot)
grid.arrange(ModelOne, ModelTwoPlot, ModelThreePlot, ncol = 2)
# This variable should be either 0 or 1. It should be 0 if the ozone is below the average for all the data observations, and 1 if it is equal to or above the average ozone observed.
airdataNONA$goodOzone <- as.factor(ifelse(airdataNONA$Ozone < mean(airdataNONA$Ozone),0,1))
# Build a model (using the ‘ksvm’ function, trying to predict ‘goodOzone’). You can use all the possible attributes, or select the attributes that you think would be the most helpful.
# Test the model on the testing dataset, and compute the percent of ‘goodOzone’ that was correctly predicted.
# Plot the results. Use a scatter plot. Have the x-axis represent temperature, the y-axis represent wind, the shape representing what was predicted (good or bad day), the color representing the actual value of ‘goodOzone’ (i.e. if the actual ozone level was good) and the size represent if the prediction was correct (larger symbols should be the observations the model got wrong).
# Compute models and plot the results for ‘svm’ (in the e1071 package) and ‘nb’ (Naive Bayes, also in the e1071 package).
# Show all three results (charts) in one window, using the grid.arrange function (have two charts in one row).
modelKSVMq <- ksvm(goodOzone ~ ., data = airdataNONA)
modelKSVMPredictq<- predict(modelKSVMq, airdataNONA)
modelKSVMErrorq <- (airdataNONA$goodOzone - modelKSVMPredictq)
㤼㸱-㤼㸲 not meaningful for factors
rootSquare(modelKSVMErrorq)
[1] NA
ModelOzoneq(modelKSVMq,airdataNONA)
ModelOzoneq 0 1
0 69 2
1 0 40
[1] 98.1982
modelSVMq <- svm(goodOzone ~ ., data = airdataNONA)
ModelOzoneq(modelSVMq, airdataNONA)
ModelOzoneq 0 1
0 69 3
1 0 39
[1] 97.2973
modelSVMPredictq <- predict(modelSVMq, airdataNONA)
modelSVMErrorq <- (airdataNONA$goodOzone - modelSVMPredictq)
㤼㸱-㤼㸲 not meaningful for factors
rootSquare(modelSVMErrorq)
[1] NA
modelNB <- naiveBayes(goodOzone ~ ., data = airdataNONA)
ModelOzoneq(modelNB, airdataNONA)
ModelOzoneq 0 1
0 68 2
1 1 40
[1] 97.2973
modelNBError <- predict(modelNB, PredictiveData)
Grifter <- data.frame(PredictiveData$goodOzone)
modelNBGrift <- predict(modelNB, Grifter)
Type mismatch between training and new data for variable 'Ozone'. Did you use factors with numeric labels for training, and numeric values for new data?Type mismatch between training and new data for variable 'Solar.R'. Did you use factors with numeric labels for training, and numeric values for new data?Type mismatch between training and new data for variable 'Wind'. Did you use factors with numeric labels for training, and numeric values for new data?Type mismatch between training and new data for variable 'Temp'. Did you use factors with numeric labels for training, and numeric values for new data?Type mismatch between training and new data for variable 'Month'. Did you use factors with numeric labels for training, and numeric values for new data?Type mismatch between training and new data for variable 'Day'. Did you use factors with numeric labels for training, and numeric values for new data?
ModelOneT <- ggplot(data = airdataNONA,aes(x=airdataNONA$Temp,y=airdataNONA$Wind)) + geom_point(shape = modelKSVMErrorq, color = "orange") + ggtitle("KSVM Model")
plot(ModelOneT)
ModelTwoT <- data.frame(airdataNONA$Wind,airdataNONA$Temp,modelSVMErrorq)
colnames(ModelTwoT) <- c("Wind","Temp","Error")
ModelTwoPlotT <- ggplot(data = airdataNONA,aes(x=airdataNONA$Temp,y=airdataNONA$Wind)) + geom_point(aes(size=modelSVMErrorq), color = "yellow") + ggtitle("SVM Model")
plot(ModelTwoPlotT)
ModelB <- data.frame(airdataNONA$Wind,airdataNONA$Temp,modelNBError)
colnames(ModelB) <- c("Wind","Temp","Error")
PlotB <- ggplot(data = airdataNONA,aes(x=airdataNONA$Temp,y=airdataNONA$Wind)) + geom_point(aes(size=modelNBError), color = "red") + ggtitle("Naive Bayes Model")
plot(PlotB)
grid.arrange(ModelOneT, ModelTwoPlotT, PlotB, ncol = 2)
NA
NA
NA
Review what you have done and state which is the best and why #Naive Bayes os more responsive with a discrete threshold