Handout10_Ch5_Vis

–Welcome to Lecture#8,Chapter V.

Let us start by finding our working directoty.

getwd()

## [1] "C:/Users/n0910803/Downloads"

# make sure the packages for this chapter
# are installed, install if necessary
#pkg <- c("ggplot2", "scales", "maptools",
    #          "sp", "maps", "grid", "car" )
#new.pkg <- pkg[!(pkg %in% installed.packages())]
#if (length(new.pkg)) {
 # install.packages(new.pkg)  
#}

Let us now make sure that the file zeroaccess is uploaded into your new project. Then let us define the variable za.

# read the CSV with headers
za <- read.csv("zeroaccess.csv", header=T,sep ="," )
#View(za)

What is the output we have just obtained after running the chunk above.

#we have just obtained latitude and logitud. It could be from users, servers.

# Load ggplot2 to create graphics
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.2.3

# create a ggplot instance with zeroaccess data
gg <- ggplot(data=za, aes(x=long, y=lat)) 
# add the points, set transparency to 1/40th 
gg <- gg + geom_point(size=1, color="#000099", alpha=1/40) 
# add axes labels
gg <- gg + xlab("Longitude") + ylab("Latitude")
# simplify the theme for aesthetics
gg <- gg + theme_bw() 
# this may take a while, over 800,000 points plotted
print(gg)

#install.packages("mapproj")

library(mapproj)

## Warning: package 'mapproj' was built under R version 4.2.3

## Loading required package: maps

## Warning: package 'maps' was built under R version 4.2.3

# requires package : ggplot2
# requires object: za (5-1)
# the "maps" and "mapproj" packages are used by ggplot
# load map data of the world
world <- map_data("world")
#Remove Antarctica
world <- subset(world, world$region!="Antarctica")
# load world data into ggplot object
gg <- ggplot(data=world, aes(x=long, y=lat))
# trace along the lat/long coords by group (countries)
gg <- gg + geom_path(aes(group=group), colour="gray70")
# now project using the mercator projection
# try different projections with ?mapproject
gg <- gg + coord_map("mercator", xlim=c(-200, 200))
# load up the ZeroAccess points, overiding the default data set
gg <- gg + geom_point(data=za, aes(long, lat), 
                      colour="#000099", alpha=1/40, size=1)
# remove text, axes ticks, grid lines and do gray border on white
gg <- gg + theme(text=element_blank(), 
                 axis.ticks=element_blank(),
                 panel.grid=element_blank(),
                 panel.background=element_rect(color="gray50",
                                               fill="white"))
print(gg)

Interpret the output above.

#high population density in U.S., Brazil, Europe and India/China. 
#GGPlot to build maps

#Let us proceed to define the file county.dataset found on blackboard.

# read up census data per county
county.data <- read.csv("countydataset.csv", header=T,sep = ",")

#View(county.data)

set.seed(1)
# generate 200 random numbers around 10
input <- rnorm(200, mean=10)
summary(input)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   7.785   9.386   9.951  10.036  10.613  12.402

Interpret the synatx and the output above.

#The code took a sample of 200 random numbers of the County data set that have a mean of 10

# requires objects: input (5-16)
# generate output around a mean of 2 x input
output <- rnorm(200, mean=input*2)
# put into data frame to plot it
our.data <- data.frame(input, output)
gg <- ggplot(our.data, aes(input, output))
gg <- gg + geom_point()
gg <- gg + geom_smooth(method = "lm", se=F, color="red")
gg <- gg + theme_bw()
print(gg)

## `geom_smooth()` using formula = 'y ~ x'

Make comments about both the syntax and the output of the task executed above. Is it possible to customize this graph to make it more explanatory? If so explain and run the code.

Let us now run output vs input.

model <- lm(output ~ input)
summary(model)

## 
## Call:
## lm(formula = output ~ input)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.93275 -0.54273 -0.02523  0.66833  2.58615 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.27224    0.77896   0.349    0.727    
## input        1.97692    0.07729  25.577   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.013 on 198 degrees of freedom
## Multiple R-squared:  0.7677, Adjusted R-squared:  0.7665 
## F-statistic: 654.2 on 1 and 198 DF,  p-value: < 2.2e-16

Is the input significant at a 5% significance level?1%?

#Yes, because P value is less than 0.05
#still yes, because is lower than 0.01

Let us now proceed to get the confidence interval for this model.

confint(model)

##                 2.5 %   97.5 %
## (Intercept) -1.263895 1.808368
## input        1.824502 2.129343

#we look at the distribution of the values. 
#if input does not go through '0' the model is significant.

Using the county.data dataset, let us run a model of Infections vs ufo2010(Aliens Visits according to the UFO).

summary(lm(county.data$Infections ~ county.data$ufo2010, data= county.data))

## 
## Call:
## lm(formula = county.data$Infections ~ county.data$ufo2010, data = county.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -407.50  -70.35   -5.78   60.05 2736.39 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         124.77740    2.38569  52.302  < 2e-16 ***
## county.data$ufo2010   0.56899    0.07998   7.114  1.4e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 127.5 on 3070 degrees of freedom
## Multiple R-squared:  0.01622,    Adjusted R-squared:  0.0159 
## F-statistic: 50.61 on 1 and 3070 DF,  p-value: 1.398e-12

Given the output printed above, is ufo2010 significant at a 5% significance level?1%?

Let us now run a model of Infections vs every single quantitative variable that is included in the dataset.

#View(county.data)

summary(lm(Infections ~ pop + income + ipaddr + ufo2010, 
           data=county.data))

## 
## Call:
## lm(formula = Infections ~ pop + income + ipaddr + ufo2010, data = county.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -342.94  -70.34   -5.40   58.85 2745.36 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.000e+02  9.426e+00  10.610  < 2e-16 ***
## pop         -1.615e-05  1.543e-05  -1.047  0.29510    
## income       5.670e-04  2.066e-04   2.745  0.00609 ** 
## ipaddr      -9.865e-08  5.148e-07  -0.192  0.84803    
## ufo2010      6.804e-01  1.691e-01   4.024 5.85e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 127.3 on 3067 degrees of freedom
## Multiple R-squared:  0.01879,    Adjusted R-squared:  0.01751 
## F-statistic: 14.69 on 4 and 3067 DF,  p-value: 6.915e-12

#there are more features. So there is MLR.
#Income is significant. We reject the null. More income, more infections (because estimate is positive, so it increases.)
#POP isn't.
#ipaddress
#UFO2010 is signifcant. So we reject the null.

#is the model as a whole, significant? Yes, because p-value: 6.915e-12 is less than 0.01/0.05.
# The global test is Ho: B1=B2=B3=b4   H1: otherwise (At least one of these conditions is not met)
#R-Squared is 0.01751 which indicates that this model only explains 1.7% of the relationships. So, its a signicant model but only explains 1.7% of the infections.
#F

Interpret the output. How would you proceed from now on in this handout given the results obtained above.

#install.packages("carData")

library(car) # for the vif() function

## Warning: package 'car' was built under R version 4.2.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.2.3

Let us just explore the variance inflation factor(VIF) of the model to see if there is a chance of high correlation between my predictors. I remind you that a strong correlation between two of my predictors will likely end up in heteroskedasticity, and therefore our model would not be accuarate.

model <- lm(Infections ~ pop + income + ipaddr + ufo2010, 
            data=county.data)
sqrt(vif(model))

##      pop   income   ipaddr  ufo2010 
## 2.165458 1.038467 1.046051 2.115512

Let us see if population affects the number of infections in this dataset. Write the null and alternative hypothesis you would use to test this relationship.

summary(lm(Infections ~ pop, data=county.data))

## 
## Call:
## lm(formula = Infections ~ pop, data = county.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -365.26  -70.35   -5.70   59.59 2724.00 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.250e+02  2.416e+00  51.755  < 2e-16 ***
## pop         4.228e-05  7.147e-06   5.916 3.67e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 127.8 on 3070 degrees of freedom
## Multiple R-squared:  0.01127,    Adjusted R-squared:  0.01095 
## F-statistic:    35 on 1 and 3070 DF,  p-value: 3.668e-09

Interpret the results obtained above.

Now let us define the regression of Infections vs pop as pop.lm and predict the number of infections based on the variable population.

pop.lm <- lm(Infections ~ pop, data=county.data)
predict(pop.lm, data.frame(pop=6000000), interval="confidence")

##        fit      lwr      upr
## 1 378.7109 295.9209 461.5009

#It is very likely that I get 378.7109 infections in an area with pop=6M

#predict the number of infections based on the following values for the variable population, 1.000.000, 2.000.000.
#Recreat this solution, building a model that depends on income.

pop.lm <- lm(Infections ~ pop, data=county.data)
predict(pop.lm, data.frame(pop=1000000), interval="confidence")

##        fit      lwr      upr
## 1 167.3069 153.9224 180.6915

#It is very likely that I get 167.3069 infections in an area with pop=1M

pop.lm <- lm(Infections ~ pop, data=county.data)
predict(pop.lm, data.frame(pop=2000000), interval="confidence")

##        fit      lwr      upr
## 1 209.5877 182.5947 236.5807

#predict the number of infections based on the following values for the variable Income, 10.000, 20.000, 40.000,90.000
#Recreat this solution, building a model that depends on income.

inc.lm <- lm(Infections ~ income, data=county.data)
predict(inc.lm, data.frame(income=10000), interval="confidence")

##        fit      lwr      upr
## 1 100.2013 85.70806 114.6945

#For income $10.000 the most likely number of infections is 100.2013 on average

inc.lm <- lm(Infections ~ income, data=county.data)
predict(inc.lm, data.frame(income=20000), interval="confidence")

##        fit      lwr      upr
## 1 108.4966 97.66147 119.3318

inc.lm <- lm(Infections ~ income, data=county.data)
predict(inc.lm, data.frame(income=40000), interval="confidence")

##        fit      lwr      upr
## 1 125.0872 120.1358 130.0387

inc.lm <- lm(Infections ~ income, data=county.data)
predict(inc.lm, data.frame(income=90000), interval="confidence")

##        fit      lwr      upr
## 1 166.5638 148.3582 184.7694

Interpret the results obtained above.

Handout10_Ch5_Vis_Reg