# 1) Data Exploration: This should include summary statistics, means, medians, quartiles, or any
# other relevant information about the data set.
library(readxl)
# Load the earthquake records from the Excel workbook. The message captured
# below shows read_excel() renaming the unnamed first column to "...1".
quakes <- read_excel("C:/Users/jmcon/Downloads/quakes.xlsx")
## New names:
## * `` -> ...1
# View() opens a GUI data viewer, which may not be available when the script
# runs without a user attached (e.g. via Rscript), so only call it in an
# interactive session.
if (interactive()) {
  View(quakes)
}
# Min / quartiles / median / mean / max for the mag, stations and depth columns.
summary(quakes$mag)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.00 4.30 4.60 4.62 4.90 6.40
summary(quakes$stations)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.00 18.00 27.00 33.42 42.00 132.00
summary(quakes$depth)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 40.0 99.0 247.0 311.4 543.0 680.0
# Spread of the mag column: sample variance and standard deviation.
var(quakes$mag)
## [1] 0.1622261
sd(quakes$mag)
## [1] 0.402773
# Data wrangling: Please perform some basic transformations. They will need to make sense but
# could include column renaming, creating a subset of the data, replacing values, or creating new
# columns with derived data (for example - if it makes sense you could sum two columns together)
# take a sample range of the data
# rename abbreviations to longitude, latitude, magnitude
# Drop numbered column
# Keep rows 100-250 and 700-1000 and columns 2 through 6; this leaves out the
# first (numbered) column.
samples <- quakes[c(seq(100, 250), seq(700, 1000)), 2:6]
print(samples)
## # A tibble: 452 x 5
## lat long depth mag stations
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -24.6 180. 484 4.7 33
## 2 -17.0 186. 108 4.1 12
## 3 -26.2 178. 583 4.6 25
## 4 -21.9 180. 608 4.7 30
## 5 -33 182. 72 4.7 22
## 6 -21.3 181. 636 4.6 29
## 7 -19.4 184. 293 4.2 15
## 8 -34.9 181. 42 4.4 25
## 9 -20.2 169. 100 4.6 22
## 10 -22.6 186. 42 5.7 76
## # ... with 442 more rows
# Replace the abbreviated headers with descriptive ones, in column order.
colnames(samples) <- c(
  "Latitude", "Longitude", "Depth", "Magnitude", "Station"
)
print(samples)
## # A tibble: 452 x 5
## Latitude Longitude Depth Magnitude Station
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -24.6 180. 484 4.7 33
## 2 -17.0 186. 108 4.1 12
## 3 -26.2 178. 583 4.6 25
## 4 -21.9 180. 608 4.7 30
## 5 -33 182. 72 4.7 22
## 6 -21.3 181. 636 4.6 29
## 7 -19.4 184. 293 4.2 15
## 8 -34.9 181. 42 4.4 25
## 9 -20.2 169. 100 4.6 22
## 10 -22.6 186. 42 5.7 76
## # ... with 442 more rows
# Per-column summary statistics for the renamed subset.
summary(samples)
## Latitude Longitude Depth Magnitude
## Min. :-38.59 Min. :165.8 Min. : 40.0 Min. :4.000
## 1st Qu.:-23.47 1st Qu.:179.7 1st Qu.:100.0 1st Qu.:4.300
## Median :-20.52 Median :181.5 Median :242.0 Median :4.500
## Mean :-20.68 Mean :179.7 Mean :310.2 Mean :4.596
## 3rd Qu.:-17.82 3rd Qu.:183.5 3rd Qu.:543.2 3rd Qu.:4.800
## Max. :-10.80 Max. :188.1 Max. :664.0 Max. :6.400
## Station
## Min. : 10.00
## 1st Qu.: 18.00
## Median : 27.00
## Mean : 33.68
## 3rd Qu.: 42.00
## Max. :132.00
# Total of the Station column over the selected rows.
sum(samples[["Station"]])
## [1] 15223
# Graphics: Please make sure to display at least one scatter plot, box plot and histogram. Don't
# be limited to this. Please explore the many other options in R packages such as ggplot2.
# Box plots of the Depth and Magnitude columns of `samples` (base graphics).
boxplot(samples$Depth)

boxplot(samples$Magnitude)

# Histograms of the same two columns. hist() derives its default title and
# x-axis label from the argument expression, so the calls are kept as written.
hist(samples$Depth)

hist(samples$Magnitude)

# Locations of earthquakes in NZ from "samples"
library(leaflet)
# Build a leaflet map: add the tile layer from addTiles(), then one marker per
# row of `samples`, each carrying the popup text "EQ".
leaflet() %>%
  addTiles() %>%
  addMarkers(
    lng = samples[["Longitude"]],
    lat = samples[["Latitude"]],
    popup = "EQ"
  )
# The relation of magnitude to stations: is Y (magnitude) associated with X (stations)?
# State hypothesis:
'Hypothesis: There will be a significant correlation between the magnitude of earthquakes
and number of stations'
## [1] "Hypothesis: There will be a significant correlation between the magnitude of earthquakes\nand number of stations"
# The predictor vector. Stations
x <- samples$Station
# The response vector. Magnitude
y <- samples$Magnitude
# Fit the simple linear regression of y on x with lm().
relation <- lm(y ~ x)
# Print the fitted model itself rather than fitting a second, identical model
# just to display it; the stored call is the same, so the output is unchanged.
print(relation)
##
## Call:
## lm(formula = y ~ x)
##
## Coefficients:
## (Intercept) x
## 4.05434 0.01609
# Predict the response for one new observation with x = 27. The column must be
# named `x` so predict() can match it to the model's predictor.
a <- data.frame(x = 27)
result <- predict(relation, a)
print(result)
## 1
## 4.48877
# Save a scatter plot of y against x, with the fitted regression line, to a
# PNG file. `relation` is the model already fitted from the same x and y, so it
# is reused here instead of being refitted.
png(filename = "linearregressionEM2.png")
# Draw the points first, then overlay the fitted line as its own step;
# abline() was previously passed as a stray positional argument to plot().
plot(x, y,
  col = "blue", main = "LR of Magnitude and Frequency of Stations",
  cex = 1.3, pch = 16, xlab = "Stations", ylab = "Magnitude"
)
abline(relation)
# Close the PNG device so the file is written out and later plots are not
# redirected into it; invisible() suppresses the device number dev.off() returns.
invisible(dev.off())
# Full regression report: residual quantiles, coefficient table, R-squared and
# the overall F-test.
summary(relation)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.58531 -0.15745 -0.02381 0.13995 0.98822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.0543367 0.0191573 211.63 <2e-16 ***
## x 0.0160901 0.0004777 33.68 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2211 on 450 degrees of freedom
## Multiple R-squared: 0.716, Adjusted R-squared: 0.7154
## F-statistic: 1135 on 1 and 450 DF, p-value: < 2.2e-16
# Estimated intercept and slope of the fitted line.
coef(relation)
## (Intercept) x
## 4.05433675 0.01609011
# Analysis-of-variance table for the fitted model.
anova(relation)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x 1 55.480 55.480 1134.6 < 2.2e-16 ***
## Residuals 450 22.004 0.049
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Written conclusion, emitted as a character string.
'Due to the size of the P-value, which is less than all signifcant values of alpha,
we can conclude to reject our null hypothesis, and say there is a signifcant correlation
of magnitude and frequency of stations'
## [1] "Due to the size of the P-value, which is less than all signifcant values of alpha,\nwe can conclude to reject our null hypothesis, and say there is a signifcant correlation\nof magnitude and frequency of stations"