# MSDS-R-final.R

# 1) Data Exploration: This should include summary statistics, means, medians, quartiles, or any
# other relevant information about the data set.

library(readxl)
# Load the earthquake workbook. The path defaults to the original download
# location, but it can be overridden without editing the script by setting
# the QUAKES_XLSX environment variable.
quakes_path <- Sys.getenv(
  "QUAKES_XLSX",
  unset = "C:/Users/jmcon/Downloads/quakes.xlsx"
)
if (!file.exists(quakes_path)) {
  stop("Earthquake workbook not found: ", quakes_path, call. = FALSE)
}
quakes <- read_excel(quakes_path)

# read_excel reports that the first, unnamed column was renamed to `...1`:
## New names:
## * `` -> ...1

# View() opens a spreadsheet-style viewer, so only call it when someone is
# there to look at it; batch runs (e.g. Rscript) skip it.
if (interactive()) {
  View(quakes)
}


# Min / quartiles / median / mean / max of the magnitude column.
summary(quakes[["mag"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.00    4.30    4.60    4.62    4.90    6.40

# Same summary for the stations column.
summary(quakes[["stations"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.00   18.00   27.00   33.42   42.00  132.00

# Same summary for the depth column.
summary(quakes[["depth"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    40.0    99.0   247.0   311.4   543.0   680.0

# Spread of magnitude: sample variance ...
var(quakes[["mag"]])

## [1] 0.1622261

# ... and sample standard deviation.
sd(quakes[["mag"]])

## [1] 0.402773

# Data wrangling: Please perform some basic transformations. They will need to make sense but
# could include column renaming, creating a subset of the data, replacing values, or creating new
# columns with derived data (for example - if it makes sense you could sum two columns together)
                           
#     take a sample range of the data
#     rename abbreviations to longitude, latitude, magnitude
#     Drop numbered column


# Keep two runs of rows (100-250 and 700-1000) and columns 2 through 6.
# Skipping column 1 drops the numbered column that was auto-named `...1`
# when the workbook was read.
keep_rows <- c(100:250, 700:1000)
samples <- quakes[keep_rows, 2:6]
print(samples)

## # A tibble: 452 x 5
##      lat  long depth   mag stations
##    <dbl> <dbl> <dbl> <dbl>    <dbl>
##  1 -24.6  180.   484   4.7       33
##  2 -17.0  186.   108   4.1       12
##  3 -26.2  178.   583   4.6       25
##  4 -21.9  180.   608   4.7       30
##  5 -33    182.    72   4.7       22
##  6 -21.3  181.   636   4.6       29
##  7 -19.4  184.   293   4.2       15
##  8 -34.9  181.    42   4.4       25
##  9 -20.2  169.   100   4.6       22
## 10 -22.6  186.    42   5.7       76
## # ... with 442 more rows

# Replace the abbreviated column names with descriptive ones (same order).
colnames(samples) <- c(
  "Latitude", "Longitude", "Depth", "Magnitude", "Station"
)
print(samples)

## # A tibble: 452 x 5
##    Latitude Longitude Depth Magnitude Station
##       <dbl>     <dbl> <dbl>     <dbl>   <dbl>
##  1    -24.6      180.   484       4.7      33
##  2    -17.0      186.   108       4.1      12
##  3    -26.2      178.   583       4.6      25
##  4    -21.9      180.   608       4.7      30
##  5    -33        182.    72       4.7      22
##  6    -21.3      181.   636       4.6      29
##  7    -19.4      184.   293       4.2      15
##  8    -34.9      181.    42       4.4      25
##  9    -20.2      169.   100       4.6      22
## 10    -22.6      186.    42       5.7      76
## # ... with 442 more rows

# Per-column summary statistics of the renamed subset.
summary(samples)

##     Latitude        Longitude         Depth         Magnitude    
##  Min.   :-38.59   Min.   :165.8   Min.   : 40.0   Min.   :4.000  
##  1st Qu.:-23.47   1st Qu.:179.7   1st Qu.:100.0   1st Qu.:4.300  
##  Median :-20.52   Median :181.5   Median :242.0   Median :4.500  
##  Mean   :-20.68   Mean   :179.7   Mean   :310.2   Mean   :4.596  
##  3rd Qu.:-17.82   3rd Qu.:183.5   3rd Qu.:543.2   3rd Qu.:4.800  
##  Max.   :-10.80   Max.   :188.1   Max.   :664.0   Max.   :6.400  
##     Station      
##  Min.   : 10.00  
##  1st Qu.: 18.00  
##  Median : 27.00  
##  Mean   : 33.68  
##  3rd Qu.: 42.00  
##  Max.   :132.00

# Derived value: total of the Station column across the sampled rows.
sum(samples[["Station"]])

## [1] 15223

## [1] 15223

# Graphics: Please make sure to display at least one scatter plot, box plot and histogram. Don't
# be limited to this. Please explore the many other options in R packages such as ggplot2.

# Box plots, one figure each, for depth and then magnitude.
for (field in c("Depth", "Magnitude")) {
  boxplot(samples[[field]])
}

# Histograms of the same two columns, in the same order. hist() builds its
# default title and x-axis label from the expression it is called with, so
# they are spelled out here to keep the labels a direct `samples$<column>`
# call would produce.
for (field in c("Depth", "Magnitude")) {
  field_label <- paste0("samples$", field)
  hist(
    samples[[field]],
    main = paste("Histogram of", field_label),
    xlab = field_label
  )
}

# Interactive map with one marker per sampled earthquake; every popup
# reads "EQ".
# NOTE(review): this map was originally headed "Locations of Earthquakes in
# NZ", but summary(samples) reports latitudes from -38.59 to -10.80 and
# longitudes from 165.8 to 188.1, which is a much wider area than New
# Zealand -- confirm the intended region name.
library(leaflet)
quake_map <- addTiles(leaflet())
addMarkers(
  quake_map,
  lng = samples$Longitude,
  lat = samples$Latitude,
  popup = "EQ"
)

# Relationship between magnitude and number of stations.
# A linear regression measures how strongly the two move together; on its
# own it does not show that either one causes the other.
# State hypothesis:
'Hypothesis: There will be a significant correlation between the magnitude of earthquakes
and number of stations'

## [1] "Hypothesis: There will be a significant correlation between the magnitude of earthquakes\nand number of stations"

# The predictor vector: Station.
x <- samples$Station

# The response vector: Magnitude.
y <- samples$Magnitude

# Fit the simple linear regression once and print that same fitted object,
# rather than refitting the model just to display it.
relation <- lm(y ~ x)
print(relation)

## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##     4.05434      0.01609

# Predict the magnitude for x = 27, which is the median Station value shown
# by summary(samples). The column must be named `x` to match the formula.
a <- data.frame(x = 27)
result <- predict(relation, a)
print(result)

##       1 
## 4.48877


# Send the scatter plot to a PNG file (relative path, so it lands in the
# current working directory).
png(file = "linearregressionEM2.png")

# Draw the observations first, then overlay the fitted regression line.
# abline() is a separate call: passing it as an unnamed argument to plot()
# binds it to plot's `type` parameter and only draws by accident of lazy
# evaluation.
plot(
  x, y,
  col = "blue", main = "LR of Magnitude and Frequency of Stations",
  cex = 1.3, pch = 16, xlab = "Stations", ylab = "Magnitude"
)
abline(relation)

# Close the device so the PNG is flushed to disk and later plots are not
# silently redirected into this file.
invisible(dev.off())

# Full regression report: residual quantiles, coefficient estimates with
# standard errors, t values and p-values, plus R-squared and the F-statistic.
summary(relation)

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.58531 -0.15745 -0.02381  0.13995  0.98822 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.0543367  0.0191573  211.63   <2e-16 ***
## x           0.0160901  0.0004777   33.68   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2211 on 450 degrees of freedom
## Multiple R-squared:  0.716,  Adjusted R-squared:  0.7154 
## F-statistic:  1135 on 1 and 450 DF,  p-value: < 2.2e-16

# Fitted intercept and slope (slope = change in y per one-unit change in x).
coef(relation)

## (Intercept)           x 
##  4.05433675  0.01609011

# ANOVA table: sums of squares for x versus the residuals, with the F test.
anova(relation)

## Analysis of Variance Table
## 
## Response: y
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## x           1 55.480  55.480  1134.6 < 2.2e-16 ***
## Residuals 450 22.004   0.049                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Written conclusion, echoed to the console as a character string.
'Due to the size of the P-value, which is less than all signifcant values of alpha,
we can conclude to reject our null hypothesis, and say there is a signifcant correlation
of magnitude and frequency of stations'

## [1] "Due to the size of the P-value, which is less than all signifcant values of alpha,\nwe can conclude to reject our null hypothesis, and say there is a signifcant correlation\nof magnitude and frequency of stations"

# MSDS-R-final.R

# jmcon

# 2020-08-01