# Print the current working directory; the relative path "zeroaccess.csv"
# read further down is resolved against it.
getwd()
[1] "/cloud/project"
# Next, make sure the packages this analysis relies on are installed.
# ggplot2 builds highly customizable visualizations, from simple to complex;
# maps supplies the outlines read by map_data(), mapproj supplies the
# projections used by coord_map(), and car provides vif().
# NOTE: "maptools" (shapefile import/export and basic geospatial helpers) was
# removed from this list: CRAN reports it as not available for this version
# of R, so install.packages() could only emit a warning for it.
pkg <- c("ggplot2", "scales", "sp", "maps", "mapproj", "grid", "car")
# installed.packages() returns a matrix; compare against the package names
# (its row names) instead of against every cell of that matrix.
new.pkg <- setdiff(pkg, rownames(installed.packages()))
if (length(new.pkg) > 0) {
  install.packages(new.pkg)
}
Installing package into ‘/cloud/lib/x86_64-pc-linux-gnu-library/4.4’
(as ‘lib’ is unspecified)
Warning in install.packages :
package ‘maptools’ is not available for this version of R
A version of this package for your version of R might be available elsewhere,
see the ideas at
https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
# Read the ZeroAccess CSV (comma-separated, first row holds the column names).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
za <- read.csv("zeroaccess.csv", header = TRUE, sep = ",")
# Open the data frame stored in za in the data viewer. View() needs an
# interactive session, so it is skipped when the script runs in batch mode.
if (interactive()) {
  View(za)
}
# Load ggplot2 to create the graphics below.
library(ggplot2)
# Scatter plot of the ZeroAccess data: longitude on x, latitude on y.
# Points are drawn at 1/40 opacity so dense areas show up darker, the axes
# are labelled, and the black-and-white theme keeps the plot simple.
gg <- ggplot(data = za, aes(x = long, y = lat)) +
  geom_point(size = 1, color = "#000099", alpha = 1 / 40) +
  xlab("Longitude") +
  ylab("Latitude") +
  theme_bw()
# This may take a while: over 800,000 points are plotted.
print(gg)
<!-- rnb-text-end -->
<!-- rnb-chunk-begin -->
<!-- rnb-output-begin eyJkYXRhIjoiXG48IS0tIHJuYi1zb3VyY2UtYmVnaW4gZXlKa1lYUmhJam9pWUdCZ2NseHVJMHhsZENkeklHbHVkbVZ6ZEdsbllYUmxJSGRvWlhSb1pYSWdjRzl3ZFd4aGRHbHZiaUJvWVhNZ1lXNGdhVzF3WVdOMElHOXVJSFJvWlNCdWRXMWlaWElnYjJZZ2FXNW1aV04wYVc5dWN5QnBiaUIwYUdseklHUmhkR0Z6WlhRdUlFWnZjbTExYkdGMFpTQjBhR1VnYm5Wc2JDQmhibVFnWVd4MFpYSnVZWFJwZG1VZ2FIbHdiM1JvWlhObGN5QjBieUIwWlhOMElIUm9hWE1nY21Wc1lYUnBiMjV6YUdsd0xseHVjM1Z0YldGeWVTaHNiU2hKYm1abFkzUnBiMjV6SUg0Z2NHOXdMQ0JrWVhSaFBXTnZkVzUwZVM1a1lYUmhLU2xjYm1CZ1lDSjkgLS0+XG5cbmBgYHJcbiNMZXQncyBpbnZlc3RpZ2F0ZSB3aGV0aGVyIHBvcHVsYXRpb24gaGFzIGFuIGltcGFjdCBvbiB0aGUgbnVtYmVyIG9mIGluZmVjdGlvbnMgaW4gdGhpcyBkYXRhc2V0LiBGb3JtdWxhdGUgdGhlIG51bGwgYW5kIGFsdGVybmF0aXZlIGh5cG90aGVzZXMgdG8gdGVzdCB0aGlzIHJlbGF0aW9uc2hpcC5cbnN1bW1hcnkobG0oSW5mZWN0aW9ucyB+IHBvcCwgZGF0YT1jb3VudHkuZGF0YSkpXG5gYGBcblxuPCEtLSBybmItc291cmNlLWVuZCAtLT5cbiJ9 -->
<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuI0xldCdzIGludmVzdGlnYXRlIHdoZXRoZXIgcG9wdWxhdGlvbiBoYXMgYW4gaW1wYWN0IG9uIHRoZSBudW1iZXIgb2YgaW5mZWN0aW9ucyBpbiB0aGlzIGRhdGFzZXQuIEZvcm11bGF0ZSB0aGUgbnVsbCBhbmQgYWx0ZXJuYXRpdmUgaHlwb3RoZXNlcyB0byB0ZXN0IHRoaXMgcmVsYXRpb25zaGlwLlxuc3VtbWFyeShsbShJbmZlY3Rpb25zIH4gcG9wLCBkYXRhPWNvdW50eS5kYXRhKSlcbmBgYCJ9 -->
```r
# Does population have an impact on the number of infections in this dataset?
# H0: the slope of pop is zero; H1: the slope of pop is not zero.
# The t-test on the pop coefficient in the summary below tests this.
summary(lm(formula = Infections ~ pop, data = county.data))
Call:
lm(formula = Infections ~ pop, data = county.data)
Residuals:
Min 1Q Median 3Q Max
-365.26 -70.35 -5.70 59.59 2724.00
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.250e+02 2.416e+00 51.755 < 2e-16 ***
pop 4.228e-05 7.147e-06 5.916 3.67e-09 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 127.8 on 3070 degrees of freedom
Multiple R-squared: 0.01127, Adjusted R-squared: 0.01095
F-statistic: 35 on 1 and 3070 DF, p-value: 3.668e-09
# Needs: ggplot2 attached, and the za data frame (5-1).
# ggplot2 relies on the "maps" and "mapproj" packages for the calls below.

# World outline coordinates, with the Antarctica rows filtered out.
world <- map_data("world")
world <- subset(world, region != "Antarctica")

# Build the whole map as one layered expression:
gg <- ggplot(data = world, aes(x = long, y = lat)) +
  # country outlines, traced along the lat/long coordinates per group
  geom_path(aes(group = group), colour = "gray70") +
  # Mercator projection (see ?mapproject for alternatives)
  coord_map("mercator", xlim = c(-200, 200)) +
  # ZeroAccess points, overriding the plot's default data set
  geom_point(
    data = za,
    aes(long, lat),
    colour = "#000099",
    alpha = 1 / 40,
    size = 1
  ) +
  # no text, axis ticks or grid lines; gray border on a white background
  theme(
    text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    panel.background = element_rect(color = "gray50", fill = "white")
  )
print(gg)
# map_data("world") loads a pre-built dataset of world map data, which
# contains coordinates for the countries of the world. This data includes
# latitude, longitude, and country names.
# The subset() call removes Antarctica from the world map data by filtering
# out any rows where the region column equals "Antarctica".
# ggplot() initializes a ggplot object with the world map data. Here, it maps
# the longitude (long) and latitude (lat) from the world dataset to the x and
# y axes of the plot.
# geom_path() draws the borders of countries by connecting the latitude and
# longitude coordinates in each country (defined by group). The colour of the
# lines is set to gray70 (a light gray).
# coord_map("mercator") applies the Mercator projection to the world map. The
# xlim = c(-200, 200) argument sets the longitude limits for the x-axis,
# essentially focusing the map on the typical world view.
# geom_point() adds points to the map based on the data in the za dataset
# (likely a dataset with latitudes and longitudes for certain locations, such
# as ZeroAccess points), drawn with the given colour, transparency
# (alpha = 1/40) and size (size = 1).
# theme() is used to customize the appearance of the map:
# text = element_blank() removes all text, axis.ticks = element_blank()
# removes axis ticks, panel.grid = element_blank() removes grid lines, and
# panel.background draws a gray50 border around the plot on a white fill.
# print(gg) outputs the final map to the R console or plotting window.
# Fit infections against population once and keep the model for prediction.
pop.lm <- lm(formula = Infections ~ pop, data = county.data)
# Fitted value and its confidence interval for a population of six million.
predict(pop.lm, newdata = data.frame(pop = 6e6), interval = "confidence")
fit lwr upr
1 378.7109 295.9209 461.5009
# Inspect county.data in the data viewer. View() needs an interactive
# session, so it is skipped when the script runs in batch mode.
if (interactive()) {
  View(county.data)
}
# Fix the RNG seed so the draw below is reproducible.
set.seed(1)
# Draw 200 normal variates centred on 10 (sd written out at its default, 1).
input <- rnorm(n = 200, mean = 10, sd = 1)
summary(input)
#interpret the syntax above
#set.seed(1) sets the random number generator's seed to 1. This ensures that
#any random processes, like generating random numbers, will produce the same
#result every time the code is run. It's useful for reproducibility, so you or
#others can get the exact same output when running the code.
#rnorm(200, mean=10) generates 200 random numbers from a normal distribution (a bell curve) with:
#A mean of 10, meaning the numbers will be centered around 10.
#A standard deviation of 1 (the default value). This means most of the numbers
#will fall between 8 and 12, but there will also be some that are outside of
#that range.
#The resulting 200 numbers are stored in the variable input.
#summary(input) provides a summary statistics of the generated data in input, including:
#Min: The minimum value in the dataset.
#1st Qu. (Quartile): The first quartile (25th percentile) value.
#Median: The median (50th percentile) value, which represents the middle of the data.
#Mean: The average of the dataset.
#3rd Qu. (Quartile): The third quartile (75th percentile) value.
#Max: The maximum value in the dataset.
# Requires: input (5-16) and ggplot2.
# Generate output values centred on 2 * input (sd left at its default of 1).
output <- rnorm(200, mean = input * 2)
# Put both vectors into a data frame so ggplot can plot them.
our.data <- data.frame(input, output)
# Scatter plot with a red least-squares line and no confidence band.
# FALSE is spelled out because F is an ordinary variable that can be
# reassigned; formula = y ~ x states the default fit explicitly, which also
# silences geom_smooth()'s "using formula" message.
gg <- ggplot(our.data, aes(input, output)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "red") +
  theme_bw()
print(gg)
#In this case, since output is generated to be around 2 * input, the scatter
#plot would likely show a positive linear relationship between input and
#output. The red line would be a straight line with a positive slope,
#reflecting the output being approximately twice the input.
# Reproducible synthetic data: 200 inputs centred on 10 and outputs centred on
# 2 * input (both with the default standard deviation of 1).
set.seed(1)
input <- rnorm(200, mean = 10)
output <- rnorm(200, mean = input * 2)
# Put data into a data frame
our.data <- data.frame(input, output)
# Build the plot in one layered expression:
gg <- ggplot(our.data, aes(input, output)) +
  # larger, semi-transparent blue points
  geom_point(color = "blue", alpha = 0.6, size = 2) +
  # red least-squares line without a confidence band (FALSE rather than the
  # reassignable F; formula = y ~ x states the default fit explicitly)
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "red") +
  # title, subtitle, axis labels and caption in a single labs() call
  labs(
    title = "Scatter Plot with Linear Regression Line",
    subtitle = "A simple linear relationship: Output ~ 2 * Input",
    x = "Input Values",
    y = "Output Values (2 * Input)",
    caption = "Data generated from a normal distribution"
  ) +
  # black-and-white theme with lighter gridlines and larger text
  theme_bw() +
  theme(
    # line widths use `linewidth`; `size` is deprecated for lines in
    # ggplot2 >= 3.4.0
    panel.grid.major = element_line(color = "gray90", linewidth = 0.5),
    panel.grid.minor = element_line(color = "gray95", linewidth = 0.25),
    text = element_text(size = 12)
  )
# Display the plot
print(gg)
# Ordinary least-squares fit of output on input, then its summary table.
model <- lm(formula = output ~ input)
summary(model)
# confint() returns a confidence interval for each coefficient of the model:
# a range of values believed, at the stated confidence level, to contain the
# true value of that parameter. The level is written out here at its default
# of 95%, i.e. we are 95% confident the true coefficient lies in the interval.
confint(model, level = 0.95)
# Using the county.data dataset, run a model of Infections vs ufo2010
# (alien visits according to the UFO data).
# Columns are referenced by bare name and resolved through `data =`, like the
# other models in this file; writing county.data$... inside the formula
# bypasses `data`, clutters the coefficient names and breaks
# predict(..., newdata = ).
summary(lm(Infections ~ ufo2010, data = county.data))
Given the output printed above, is ufo2010 significant at the 5% significance level? At the 1% level?
Let us now run a model of Infections vs every single quantitative variable that is included in the dataset.
# Inspect county.data in the data viewer. View() needs an interactive
# session, so it is skipped when the script runs in batch mode.
if (interactive()) {
  View(county.data)
}
# Regress Infections on all four quantitative predictors at once.
summary(lm(Infections ~ pop + income + ipaddr + ufo2010,
           data = county.data))
# Install carData only when it is missing, instead of re-installing it on
# every run.
if (!requireNamespace("carData", quietly = TRUE)) {
  install.packages("carData")
}
library(car) # for the vif() function
# Examine the variance inflation factor (VIF) of the model to assess whether
# the predictors are strongly correlated with one another. Strong correlation
# between predictors is multicollinearity: it inflates the standard errors of
# the coefficient estimates and makes them unstable. (It is not the same
# thing as heteroskedasticity, which is non-constant residual variance.)
# A common rule of thumb treats sqrt(VIF) > 2 as a warning sign.
model <- lm(Infections ~ pop + income + ipaddr + ufo2010,
            data = county.data)
sqrt(vif(model))
<!-- rnb-text-end -->
<!-- rnb-chunk-begin -->
<!-- rnb-output-begin eyJkYXRhIjoiXG48IS0tIHJuYi1zb3VyY2UtYmVnaW4gZXlKa1lYUmhJam9pWUdCZ2NseHVJMHhsZENkeklHbHVkbVZ6ZEdsbllYUmxJSGRvWlhSb1pYSWdjRzl3ZFd4aGRHbHZiaUJvWVhNZ1lXNGdhVzF3WVdOMElHOXVJSFJvWlNCdWRXMWlaWElnYjJZZ2FXNW1aV04wYVc5dWN5QnBiaUIwYUdseklHUmhkR0Z6WlhRdUlFWnZjbTExYkdGMFpTQjBhR1VnYm5Wc2JDQmhibVFnWVd4MFpYSnVZWFJwZG1VZ2FIbHdiM1JvWlhObGN5QjBieUIwWlhOMElIUm9hWE1nY21Wc1lYUnBiMjV6YUdsd0xseHVjM1Z0YldGeWVTaHNiU2hKYm1abFkzUnBiMjV6SUg0Z2NHOXdMQ0JrWVhSaFBXTnZkVzUwZVM1a1lYUmhLU2xjYm1CZ1lDSjkgLS0+XG5cbmBgYHJcbiNMZXQncyBpbnZlc3RpZ2F0ZSB3aGV0aGVyIHBvcHVsYXRpb24gaGFzIGFuIGltcGFjdCBvbiB0aGUgbnVtYmVyIG9mIGluZmVjdGlvbnMgaW4gdGhpcyBkYXRhc2V0LiBGb3JtdWxhdGUgdGhlIG51bGwgYW5kIGFsdGVybmF0aXZlIGh5cG90aGVzZXMgdG8gdGVzdCB0aGlzIHJlbGF0aW9uc2hpcC5cbnN1bW1hcnkobG0oSW5mZWN0aW9ucyB+IHBvcCwgZGF0YT1jb3VudHkuZGF0YSkpXG5gYGBcblxuPCEtLSBybmItc291cmNlLWVuZCAtLT5cbiJ9 -->
<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuI0xldCdzIGludmVzdGlnYXRlIHdoZXRoZXIgcG9wdWxhdGlvbiBoYXMgYW4gaW1wYWN0IG9uIHRoZSBudW1iZXIgb2YgaW5mZWN0aW9ucyBpbiB0aGlzIGRhdGFzZXQuIEZvcm11bGF0ZSB0aGUgbnVsbCBhbmQgYWx0ZXJuYXRpdmUgaHlwb3RoZXNlcyB0byB0ZXN0IHRoaXMgcmVsYXRpb25zaGlwLlxuc3VtbWFyeShsbShJbmZlY3Rpb25zIH4gcG9wLCBkYXRhPWNvdW50eS5kYXRhKSlcbmBgYCJ9 -->
```r
# Does population have an impact on the number of infections in this dataset?
# H0: the slope of pop is zero; H1: the slope of pop is not zero.
# The t-test on the pop coefficient in the summary below tests this.
summary(lm(formula = Infections ~ pop, data = county.data))
#Interpretation of the output above:
#Population is statistically significant in predicting Infections, but the relationship is weak. The R-squared value is very low, meaning that population alone explains only a small portion of the variability in infections.
# model suggests a positive but very small effect of population on infections, with each unit increase in population leading to a very tiny increase in infections.
#Despite the statistical significance of the model, the low R-squared suggests that other factors not included in the model might be influencing the number of infections, and population alone may not be a strong enough predictor.
#Conclusion:
#While population appears to have a statistically significant relationship with the number of infections, the weak explanatory power (low R-squared) suggests that population is not the main factor driving infections in this dataset. Further investigation with additional variables may provide a better model.
# Fit infections against population once and keep the model for prediction.
pop.lm <- lm(formula = Infections ~ pop, data = county.data)
# Fitted value and its confidence interval for a population of six million.
predict(pop.lm, newdata = data.frame(pop = 6e6), interval = "confidence")
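#For a population of 6,000,000 the model predicts 378.71 infections, with a
#confidence interval of 295.92 to 461.50. With interval="confidence" and the
#default level of 0.95, this is a 95% confidence interval for the mean number
#of infections at that population size: it expresses the uncertainty around
#the predicted mean. It is not the range within which an individual county's
#actual count must fall; interval="prediction" gives that (wider) range.
#Predicted Infections: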
#For a population of 1,000,000, the predicted number of infections is 167.28.
#For a population of 2,000,000, the predicted number of infections is 209.56.
#Summary of Predictions:
#Based on Population:
#For population = 1,000,000, the predicted number of infections is 167.28.
##Based on Income:
####infections, though further analysis may be needed to refine the model and incorporate other important variables.