#Reading the data
mydata <- read.table("./weatherAUS.csv", header=TRUE, sep = ",", dec = ",")
#Including only needed data
mydata <- mydata[, c(2,5,9,12,13,14,15,18,19)]
#Setting initial point of sampling
set.seed(1)
#Random sample of 100 units
#mydata1 <- mydata[sample(nrow(mydata), 100, replace = TRUE),]
#Showing first 6 rows of the data
head(mydata)
## Location Rainfall WindGustSpeed WindSpeed9am WindSpeed3pm
## 1 Albury 0.6 44 20 24
## 2 Albury 0 44 4 22
## 3 Albury 0 46 19 26
## 4 Albury 0 24 11 9
## 5 Albury 1 41 7 20
## 6 Albury 0.2 56 19 24
## Humidity9am Humidity3pm Cloud9am Cloud3pm
## 1 71 22 8 NA
## 2 44 25 NA NA
## 3 38 30 NA 2
## 4 45 16 NA NA
## 5 82 33 7 8
## 6 55 23 NA NA
Unit of observation: 1 location in Australia
Sample size: 74624
#checking if there are any missing values in the dataset
any_na <- any(is.na(mydata))
if (any_na) {
na_values <- which(is.na(mydata), arr.ind = TRUE)
}
#Removing rows with NA values
mydata <- na.omit(mydata)
any_na <- any(is.na(mydata))
print(any_na)
## [1] FALSE
#Descriptive statistics
library(psych)
summary(mydata)
## Location Rainfall WindGustSpeed
## Length:74624 Length:74624 Min. : 7.0
## Class :character Class :character 1st Qu.: 31.0
## Mode :character Mode :character Median : 39.0
## Mean : 40.7
## 3rd Qu.: 48.0
## Max. :126.0
## WindSpeed9am WindSpeed3pm Humidity9am Humidity3pm
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 9.00 1st Qu.:13.00 1st Qu.: 56.00 1st Qu.: 36.00
## Median :15.00 Median :19.00 Median : 69.00 Median : 52.00
## Mean :15.27 Mean :19.52 Mean : 67.64 Mean : 50.95
## 3rd Qu.:20.00 3rd Qu.:24.00 3rd Qu.: 82.00 3rd Qu.: 65.00
## Max. :69.00 Max. :76.00 Max. :100.00 Max. :100.00
## Cloud9am Cloud3pm
## Min. :0.000 Min. :0.000
## 1st Qu.:1.000 1st Qu.:2.000
## Median :5.000 Median :5.000
## Mean :4.441 Mean :4.512
## 3rd Qu.:7.000 3rd Qu.:7.000
## Max. :8.000 Max. :9.000
#install.packages("pastecs")
mydata_PCA <- mydata[ ,c("WindGustSpeed", "WindSpeed9am", "WindSpeed3pm", "Humidity9am", "Humidity3pm", "Cloud9am", "Cloud3pm")]
library(pastecs)
#correlation matrix
R <- cor(mydata_PCA)
round(R, 3)
## WindGustSpeed WindSpeed9am WindSpeed3pm Humidity9am
## WindGustSpeed 1.000 0.610 0.689 -0.186
## WindSpeed9am 0.610 1.000 0.510 -0.233
## WindSpeed3pm 0.689 0.510 1.000 -0.099
## Humidity9am -0.186 -0.233 -0.099 1.000
## Humidity3pm -0.031 -0.042 0.026 0.703
## Cloud9am 0.078 0.027 0.059 0.464
## Cloud3pm 0.113 0.053 0.031 0.373
## Humidity3pm Cloud9am Cloud3pm
## WindGustSpeed -0.031 0.078 0.113
## WindSpeed9am -0.042 0.027 0.053
## WindSpeed3pm 0.026 0.059 0.031
## Humidity9am 0.703 0.464 0.373
## Humidity3pm 1.000 0.532 0.536
## Cloud9am 0.532 1.000 0.607
## Cloud3pm 0.536 0.607 1.000
Correlation between the variables should be at least /0.3/ (in absolute). We can clearly see that correlation between some variables are above 0.3. At this point I should not continue but due to academic reasons (and problems finding the appropriate data), I will continue with the analysis.
"Bartlett's test of sphericity"
## [1] "Bartlett's test of sphericity"
cortest.bartlett(R, n = nrow(mydata_PCA))
## $chisq
## [1] 215827.2
##
## $p.value
## [1] 0
##
## $df
## [1] 21
H0: P=I H1: P=/I Based on this sample data, we can reject H0 and conclude that at least one correlation differ from 0(p<0.001).
det(R)
## [1] 0.05544534
det(R) should be above 0.00001 but this is more important when doing factor analysis.
#Kaiser-Mayer-Olkin test
KMO(R)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = R)
## Overall MSA = 0.7
## MSA for each item =
## WindGustSpeed WindSpeed9am WindSpeed3pm Humidity9am Humidity3pm
## 0.65 0.76 0.67 0.68 0.69
## Cloud9am Cloud3pm
## 0.78 0.72
KMO and MSA should be above 0.5. We are good.
So, regarding the fact that Bartlett test of specificity is okay, KMO and MSA are higher than 0.5 and assume that we have high correlation, the data should be suitable.
#How many components?
library(FactoMineR)
components <- PCA(mydata_PCA,
scale.unit = TRUE,
graph = FALSE
)
library(factoextra)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
get_eigenvalue(components)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 2.6239974 37.485677 37.48568
## Dim.2 2.2674185 32.391693 69.87737
## Dim.3 0.6980115 9.971593 79.84896
## Dim.4 0.4841020 6.915743 86.76471
## Dim.5 0.3995835 5.708335 92.47304
## Dim.6 0.2830503 4.043576 96.51662
## Dim.7 0.2438369 3.483384 100.00000
#scree plot
library(factoextra)
fviz_eig(components,
choice = "eigenvalue",
main = "Scree plot",
ylab = "Eigenvalue",
xlab = "Principal component",
addlabels = TRUE)
The biggest change in slope is at the second component, so i should take
1 but 1 is not an option.
#Parallel Analysis
library(psych)
fa.parallel(mydata_PCA,
sim = FALSE,
fa = "pc",
)
## Parallel analysis suggests that the number of factors = NA and the number of components = 2
#2 PC
library(FactoMineR)
components <- PCA(mydata_PCA,
scale.unit = TRUE,
graph = FALSE,
ncp = 2)
components
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 74624 individuals, described by 7 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
print(components$var$cor)
## Dim.1 Dim.2
## WindGustSpeed -0.1397851 0.8889843
## WindSpeed9am -0.1894742 0.7973667
## WindSpeed3pm -0.1123595 0.8329491
## Humidity9am 0.8145468 -0.1204345
## Humidity3pm 0.8583730 0.1092632
## Cloud9am 0.7773065 0.2351017
## Cloud3pm 0.7425874 0.2565379
print(components$var$contrib)
## Dim.1 Dim.2
## WindGustSpeed 0.7446609 34.8543110
## WindSpeed9am 1.3681595 28.0404214
## WindSpeed3pm 0.4811229 30.5988629
## Humidity9am 25.2853351 0.6396911
## Humidity3pm 28.0794600 0.5265216
## Cloud9am 23.0261462 2.4376978
## Cloud3pm 21.0151155 2.9024942
library(factoextra)
fviz_pca_var(components,
repel = TRUE)
library(factoextra)
fviz_pca_biplot(components)
head(components$ind$coord)
## Dim.1 Dim.2
## 5 1.046362 -0.2277070
## 12 2.877003 -0.3981559
## 13 2.096752 2.7383300
## 17 1.084362 -1.6820571
## 18 0.659941 0.4091306
## 30 1.916516 0.7398658
components$ind$coord[18, ]
## Dim.1 Dim.2
## 2.485754 -1.070475
mydata_PCA_std <- scale(mydata_PCA)
mydata_PCA_std[18, ]
## WindGustSpeed WindSpeed9am WindSpeed3pm Humidity9am Humidity3pm
## -0.1259713 -0.9589171 -1.7999475 0.3323609 1.6735580
## Cloud9am Cloud3pm
## 1.2384725 1.2861955
mydata$PC1 <- components$ind$coord[ , 1]
mydata$PC2 <- components$ind$coord[ , 2]
head(mydata)
## Location Rainfall WindGustSpeed WindSpeed9am WindSpeed3pm
## 5 Albury 1 41 7 20
## 12 Albury 2.2 31 15 13
## 13 Albury 15.6 61 28 28
## 17 Albury 0 22 11 9
## 18 Albury 16.8 63 6 20
## 30 Albury 1.2 50 11 22
## Humidity9am Humidity3pm Cloud9am Cloud3pm PC1 PC2
## 5 82 33 7 8 1.046362 -0.2277070
## 12 89 91 8 8 2.877003 -0.3981559
## 13 76 93 8 8 2.096752 2.7383300
## 17 69 82 8 1 1.084362 -1.6820571
## 18 80 65 8 1 0.659941 0.4091306
## 30 78 70 8 8 1.916516 0.7398658
cor(x=mydata$PC1, y=mydata$PC2)
## [1] -2.00817e-14
We managed to reduce variables and therefore simplify the data but still manage to explain around 70% of the total variance.