rm(list=ls())
if (!require('openxlsx')) install.packages('openxlsx')
library('openxlsx')
if (!require('ggplot2')) install.packages('ggplot2')
library('ggplot2')
if (!require('EnvStats')) install.packages('EnvStats')
library('EnvStats')
if (!require('DescTools')) install.packages('DescTools')
library('DescTools')
if (!require('seas')) install.packages('seas')
library('seas')
if (!require('data.table')) install.packages('data.table')
library('data.table')
if (!require('plotmo')) install.packages('plotmo')
library('plotmo')
if (!require('GGally')) install.packages('GGally')
library('GGally')
First, read the data from the excel file. This excel file needs to be in the R working directory.Alternatively, we would need to include the path the excel file.This data is from Table A.3, page 618 of Wilks' book on Statistical methods in the Atmospheric Sciences (3rd Ed). It is the June climate data for Guayaquil, Ecuador, for 1951-1970.
df = read.xlsx(xlsxFile = "C:\\Users\\Abdul Mannan\\Documents\\Desktop\\MET671\\tableA3.xlsx")
str(df)
## 'data.frame': 20 obs. of 5 variables:
## $ Year : num 1951 1952 1953 1954 1955 ...
## $ El_Nino : chr "T" "F" "T" "F" ...
## $ temp_C : num 26.1 24.5 24.8 24.5 24.1 24.3 26.4 24.9 23.7 23.5 ...
## $ precip_mm : num 43 10 4 0 2 NA 31 0 0 0 ...
## $ pressure_mb: num 1010 1011 1011 1011 1012 ...
any(is.na(df))
## [1] TRUE
mean.precip = mean(df$`precip_mm`, na.rm=TRUE) #The na.rm will ignore NA (missing) value observation during calculations
mean.precip
## [1] 12.94737
median.precip = median(df$`precip_mm`, na.rm=TRUE)
median.precip
## [1] 2
lowerq.precip = as.numeric(quantile(df$`precip_mm`, probs = c(0.25) , na.rm=TRUE))
upperq.precip = as.numeric(quantile(df$`precip_mm`, probs = c(0.75) , na.rm=TRUE))
trimean.precip = (lowerq.precip+2*median.precip+upperq.precip)/4
trimean.precip
## [1] 2.75
The median is the least sensitive, as one or two outliers will only move the median one or two positions left or right. The trimean would be the next most sensitive, because not only does it depend on quartiles, which will move more sluggishly when you add in a few outliers (since they are calculated with many other pieces of data) but also because the mean in the trimnean calculation is multiplied by 2*(1/4), further watering down the affect. The most susceptible would be the mean.
The MeanAD function calculates the mean absolute deviation from the mean value (or from another supplied center point) of x, after having removed NA values
1/n sum(|x_i - c|) where c=mean(x) or c=med(x)
The function supports the use of weights. The default function for the center value Mean() has a weights arguments, too. If a user defined function is used it must be assured that it has a weights argument. Ref:http://finzi.psych.upenn.edu/R/library/DescTools/html/MeanAD.html
Median.Abs.Dev.pressure = MeanAD(df$`pressure_mb`, weights = NULL, center = median, na.rm = TRUE)
Median.Abs.Dev.pressure
## [1] 0.63
Mean.Abs.Dev.pressure = MeanAD(df$`pressure_mb`, weights = NULL, center = mean, na.rm = TRUE)
Mean.Abs.Dev.pressure
## [1] 0.65
IQR.pressure = IQR(df$`pressure_mb`, na.rm = TRUE, type = 7)
IQR.pressure
## [1] 0.75
#One of the nine quantile algorithms discussed in Hyndman and Fan (1996), selected by type, is employed
#Type 7 (Default): m = 1-p. p[k] = (k - 1) / (n - 1). In this case, p[k] = mode[F(x[k])]. This is used by S.
#Ref: R Documentation
SD.pressure = sd(df$`pressure_mb`)
SD.pressure
## [1] 0.8798923
stem(df$`temp_C`, scale = 1, width = 80, atom = 1e-08)
##
## The decimal point is at the |
##
## 23 | 577
## 24 | 011334556889
## 25 | 2
## 26 | 1468
#scale: This controls the plot length.A value of scale = 2 will cause the plot to be roughly twice as long as the default.
#width: The desired width of plot
#atom: a tolerance
#Infinite and missing values in (df) are discarded.
Ref: https://r.789695.n4.nabble.com/Yule-Kendall-resistant-measure-of-skewness-td860734.html
lowerq.temp = as.numeric(quantile(df$`temp_C`, probs = c(0.25) , na.rm=TRUE))
median.temp = as.numeric(quantile(df$`temp_C`, probs = c(0.5) , na.rm=TRUE))
upperq.temp = as.numeric(quantile(df$`temp_C`, probs = c(0.75) , na.rm=TRUE))
IQR.temp = IQR(df$`temp_C`,na.rm = TRUE)
YKI.temp = (lowerq.temp-2*median.temp+upperq.temp/IQR.temp)
YKI.temp
## [1] 3.642857
Ref: http://finzi.psych.upenn.edu/R/library/EnvStats/html/skewness.html
skewness.coeff.temp=skewness(df$`temp_C`, na.rm = FALSE, method = "fisher", l.moment.method = "unbiased", plot.pos.cons = c(a = 0.35, b = 0))
skewness.coeff.temp
## [1] 0.9740256
The e.c.d.f. (empirical cumulative distribution function) Fn is a step function with jumps i/n at observation values, where i is the number of tied observations at that value. Missing values are ignored.
For observations x= (x1,x2, ... xn), Fn is the fraction of observations less or equal to t, i.e.,
Fn(t) = #{xi <= t}/n = 1/n sum(i=1,n) Indicator(xi <= t)
Ref: http://finzi.psych.upenn.edu/R/library/stats/html/ecdf.html
fn = ecdf(df$`pressure_mb`)
plot(fn, xlab=("Pressure"), ylab=("ECDF"), main = paste("Empirical Cumulative Distribution for Pressure Data"))
df = as.data.frame(df)
ggplot(df, aes(pressure_mb))+labs(
x="Pressure (mb)",
y="Fn(Pressure)",
title="Empirical Cumulative Frequency Distribution of Pressure data"
)+stat_ecdf()
ggplot(df, aes(pressure_mb))+geom_histogram()+labs(
x="Pressure (mb)",
y="Frequency",
title="Frequency Distribution of Pressure data"
)
dt <- df$pressure_mb
h <- hist(
dt,
breaks = 40,
main=paste("Frequency Vs ECDF graph for pressure data"),
xlab = "Pressure (mb)"
)
par(new = T)
ec <- ecdf(dt)
plot(x = h$mids, y=ec(h$mids)*max(h$counts), col = rgb(0,0,0,alpha=0), axes=F, xlab=NA, ylab=NA)
lines(x = h$mids, y=ec(h$mids)*max(h$counts), col ='red')
axis(4, at=seq(from = 0, to = max(h$counts), length.out = 11), labels=seq(0, 1, 0.1), col = 'red', col.axis = 'red')
mtext(side = 4, line = 3, 'Cumulative Density', col = 'red')
ggplot(df, aes(x=precip_mm))+geom_boxplot()
curve(dnorm(x,0,1), xlim=c(-3,3),lwd=3,main="Confidence Intervals
and Confidence Levels",xlab="True mean as a random variable",ylab="Probability density",xaxt="n",cex.lab=1.3)
polygon(c(-1.96, seq(-1.96,1.96,len=100), 1.96),c(0,dnorm(seq(-1.96,1.96,len=100)),0),col="skyblue")
polygon(c(-1.0,seq(-1.0, 1, length=100), 1),
c(0, dnorm(seq(-1.0, 1, length=100)), 0.0),col="white")
polygon(c(-3.0,seq(-3.0, -1.96, length=100), -1.96),
c(0, dnorm(seq(-3.0, -1.96, length=100)), 0.0),col="red")
s=sd(df$temp_C)
u=mean(df$temp_C)
V= c(df$temp_C)
anomaly=V-u
std.anomaly=anomaly/s
timeyr<-seq(1951, 1970)
plot(timeyr,std.anomaly,type="s",
cex.lab=1, lwd=2,
xlab="Year", ylab="STD Temperature anomaly [oC]",
main="Temperature anomalies: 1951-1970")
lines(timeyr, rep(0,20),type="l",col="red")
std.anomaly.1951=std.anomaly-std.anomaly[1]
timeyr<-seq(1951, 1970)
plot(timeyr,std.anomaly.1951,type="s",
cex.lab=1, lwd=2,
xlab="Year", ylab="STD Temperature anomaly [oC]",
main="Temperature anomalies: 1951-1970
w.r.t. June 1951")
lines(timeyr, rep(0,20),type="l",col="red")
ggplot(df, aes(x=temp_C, y=pressure_mb))+geom_point()+geom_smooth(se=FALSE, method="lm")+labs(title = "Scatter Plot between Temperature and Pressure", x="Temperature [oC]", y="Pressure [mb]")
## `geom_smooth()` using formula 'y ~ x'
df=na.omit(df)
# Grab only numeric columns
num.cols <- sapply(df, is.numeric)
# Filter to numeric columns for correlation
cor.data <- cor(df[,num.cols],use="everything", method="pearson")
cor.data
## Year temp_C precip_mm pressure_mb
## Year 1.00000000 0.1053675 0.1559889 0.08738603
## temp_C 0.10536751 1.0000000 0.7028529 -0.83002211
## precip_mm 0.15598886 0.7028529 1.0000000 -0.67753439
## pressure_mb 0.08738603 -0.8300221 -0.6775344 1.00000000
#Lets look at our correlation data
corrplot::corrplot(cor.data, method='color')
df=na.omit(df)
# Grab only numeric columns
num.cols <- sapply(df, is.numeric)
# Filter to numeric columns for correlation
cor.data <- cor(df[,num.cols],use="everything", method="spearman")
cor.data
## Year temp_C precip_mm pressure_mb
## Year 1.00000000 0.1203868 -0.09939439 0.06502646
## temp_C 0.12038683 1.0000000 0.60638559 -0.69674296
## precip_mm -0.09939439 0.6063856 1.00000000 -0.63195066
## pressure_mb 0.06502646 -0.6967430 -0.63195066 1.00000000
#Lets look at our correlation data
corrplot::corrplot(cor.data, method='color')
temp=matrix(df$temp_C)
precip=matrix(df$precip_mm)
pres=matrix(df$pressure_mb)
stars(df)