# CA#10: Vis Reg

# Print the current working directory; the "##" line below is the recorded
# console output of this call.
getwd()

## [1] "C:/Users/m0965094/OneDrive/Desktop/1 - STU - DATA ANALYTICS/7 - DMML/CA#10"

# make sure the packages for this chapter
# are installed, install if necessary
#pkg <- c("ggplot2", "scales", "maptools",
#              "sp", "maps", "grid", "car" )
#new.pkg <- pkg[!(pkg %in% installed.packages())]
#if (length(new.pkg)) {
  #install.packages(new.pkg)  
#}

# Load the ZeroAccess CSV into `za`. The file is read as comma-separated
# with the first row treated as column names.
# file.path() joins its pieces with "/", producing the same absolute path
# as before while keeping each source line short.
za <- read.csv(
  file.path(
    "C:/Users/m0965094/OneDrive/Desktop",
    "1 - STU - DATA ANALYTICS", "7 - DMML", "CA#10",
    "zeroaccess.csv"
  ),
  header = TRUE,  # spelled out: `T` is an ordinary variable and can be reassigned
  sep = ","
)
#View(za)

# Load ggplot2 to create graphics
#library(ggplot2)
# create a ggplot instance with zeroaccess data
#gg <- ggplot(data=za, aes(x=long, y=lat)) 
# add the points, set transparency to 1/40th 
#gg <- gg + geom_point(size=1, color="#000099", alpha=1/40) 
# add axes labels
#gg <- gg + xlab("Longitude") + ylab("Latitude")
# simplify the theme for aesthetics
#gg <- gg + theme_bw() 
# this may take a while, over 800,000 points plotted
#print(gg)

#install.packages("mapproj")
#library(mapproj)

# requires package : ggplot2
# requires object: za (5-1)
# the "maps" and "mapproj" packages are used by ggplot
# load map data of the world
#world <- map_data("world")
#Remove Antarctica
#world <- subset(world, world$region!="Antarctica")
# load world data into ggplot object
#gg <- ggplot(data=world, aes(x=long, y=lat))
# trace along the lat/long coords by group (countries)
#gg <- gg + geom_path(aes(group=group), colour="gray70")
# now project using the mercator projection
# try different projections with ?mapproject
#gg <- gg + coord_map("mercator", xlim=c(-200, 200))
# load up the ZeroAccess points, overriding the default data set
#gg <- gg + geom_point(data=za, aes(long, lat), 
#                      colour="#000099", alpha=1/40, size=1)
# remove text, axes ticks, grid lines and do gray border on white
#gg <- gg + theme(text=element_blank(), 
#                 axis.ticks=element_blank(),
#                 panel.grid=element_blank(),
#                 panel.background=element_rect(color="gray50",
#                                               fill="white"))
#print(gg)

# Load the county data set into `county.data`. The file is read as
# comma-separated with the first row treated as column names.
# file.path() joins its pieces with "/", producing the same absolute path
# as before while keeping each source line short.
county.data <- read.csv(
  file.path(
    "C:/Users/m0965094/OneDrive/Desktop",
    "1 - STU - DATA ANALYTICS", "7 - DMML", "CA#10",
    "countydataset.csv"
  ),
  header = TRUE,  # spelled out: `T` is an ordinary variable and can be reassigned
  sep = ","
)
#View(county.data)
#set.seed(1)
# generate 200 random numbers around 10
#input <- rnorm(200, mean=10)
#summary(input)
# requires objects: input (5-16)
# generate output around a mean of 2 x input
#output <- rnorm(200, mean=input*2)
# put into data frame to plot it
#our.data <- data.frame(input, output)
#gg <- ggplot(our.data, aes(input, output))
#gg <- gg + geom_point()
#gg <- gg + geom_smooth(method = "lm", se=F, color="red")
#gg <- gg + theme_bw()
#print(gg)

#Make comments about both the syntax and the output of the task executed above. Is it possible to customize this graph to make it more explanatory?

#QUESTION: Is the input significant at a 5% significance level? 1%?

#model <- lm(output ~ input)
#summary(model)

#confint(model)

#summary(lm(county.data$Infections ~ county.data$ufo2010, data= county.data))

#View(county.data)

#summary(lm(Infections ~ pop + income + ipaddr + ufo2010, 
#           data=county.data))

#install.packages("carData")
#library(car) # for the vif() function

#model <- lm(Infections ~ pop + income + ipaddr + ufo2010, 
#            data=county.data)
#sqrt(vif(model))

#summary(lm(Infections ~ pop, data=county.data))

#QUESTION: PREDICT FOR 1 000 000 AND 2 000 000
#RECREATE THIS SOLUTION BUILDING MODEL THAT DEPENDS ON INCOME
    # PREDICT NUMBER OF INFECTIONS BASED ON VARIABLE INCOME: 10 000, 20 000, 40 000, 90 000

# Simple linear regressions on `county.data`: Infections explained by
# population, and separately by income. Each model is fitted once and reused
# for every prediction. The "##" lines are the recorded console output of
# the predict() call directly above them; interval = "confidence" returns
# the fitted value with lower/upper confidence bounds (predict.lm's default
# level is 0.95).

# Model 1: Infections as a function of population.
pop.lm <- lm(Infections ~ pop, data = county.data)

# Prediction for a population of 6 000 000.
predict(pop.lm, data.frame(pop = 6000000), interval = "confidence")

##        fit      lwr      upr
## 1 378.7109 295.9209 461.5009

# Answer: number of infections based on population = 1 000 000.
predict(pop.lm, data.frame(pop = 1000000), interval = "confidence")

##        fit      lwr      upr
## 1 167.3069 153.9224 180.6915

# Answer: number of infections based on population = 2 000 000.
predict(pop.lm, data.frame(pop = 2000000), interval = "confidence")

##        fit      lwr      upr
## 1 209.5877 182.5947 236.5807

# Model 2: Infections as a function of income. It is stored under its own
# name so that `pop.lm` keeps referring to the population model instead of
# being overwritten by a model with a different predictor.
income.lm <- lm(Infections ~ income, data = county.data)

# Answer: predicted number of infections for income = $10 000.
predict(income.lm, data.frame(income = 10000), interval = "confidence")

##        fit      lwr      upr
## 1 100.2013 85.70806 114.6945

# Answer: predicted number of infections for income = $20 000.
predict(income.lm, data.frame(income = 20000), interval = "confidence")

##        fit      lwr      upr
## 1 108.4966 97.66147 119.3318

# Answer: predicted number of infections for income = $40 000.
predict(income.lm, data.frame(income = 40000), interval = "confidence")

##        fit      lwr      upr
## 1 125.0872 120.1358 130.0387

# Answer: predicted number of infections for income = $90 000.
predict(income.lm, data.frame(income = 90000), interval = "confidence")

##        fit      lwr      upr
## 1 166.5638 148.3582 184.7694