Package loading
library(tidyverse)
library(readxl)
library(stringr)
library(GGally)
library(corrplot)
library(Hmisc)
library(DataExplorer)
library(knitr)
library(rgdal)
library(ggmap)
library(plotly)
library(corrr)
Loading the dataset
raw <- read_excel("200204_MSc_ALL.xlsx")
Variable cleansing and renaming
df = raw %>%
select(c("id"= "Profile_ID", #ID
L="L*", A = "a*", B = "b*", #colores #1
C = "C*", H= "h*", #colores #2
red = "R",green="G", blue= "B", #colores #3
fed = "Fed", #hierro estas variables son linealmente dependientes "Fe (%) diluted","Fe (%); ppm", Fe (%) fueron eliminadas
'c_per' = "% C", #porcentaje de materia organica
"Fe", "FeError", "Zr", "ZrError", #elementos hierro y circonio
prof_depth = "profile_depth_cm", #profundidad del perfil (cm)
samp_depth = "sample_depth_cm", #profundidad de la muestra (cm)
weight_nod ="weight_N_g",#peso nĂ³dulos de hierro(g)
weight_qz = "weight_Qz_g",#peso de cuarzo (g)
"GPS_U","E_UTM","N_UTM",#info de coordenadas
geology = "Geology_Atlas_Micaschistes_Schistes_Embrechites", #M Micaschistes (up in a hill), SSchistes (slope), E Embrechite (low)
slope = "slope_class_quali", #pendiente
horizon = "Horizon",
pH ="pH H2O",
texture="Texture group", #SUCS
coarse_per="2-0,05",int_per="0,05-0,002",fine_per="<0,002" #granulometrĂa(%)
))
Row cleansing
is_fenster = str_detect(df$id, "Fenster") #window observations that are not important
df = df %>%
mutate(is_fenster = is_fenster) %>%
filter(id != "NA")%>% #NA ids
filter(id != "RK_HYA_A_1" )%>% #testing observation
filter(is_fenster == FALSE)%>% #window observation not important
select(-is_fenster)
Minor transformations
#Transformation of categorical variables into factors
categorical = c('geology', 'slope', 'horizon', 'texture')
df[categorical] <- lapply(df[categorical] , factor)
Coordinate transformation
#transformation of coordinates to long and lat WGS 1984
df$ID <- seq.int(nrow(df)) #setting an index
UTM_32_index = df %>%
filter(GPS_U == 32)%>%
select(ID)
UTM_33_index = df %>%
filter(GPS_U == 33)%>%
select(ID)
UTM_32= df %>%
filter(GPS_U == 32)%>%
select(E_UTM, N_UTM)%>%
SpatialPoints(proj4string=CRS("+proj=utm +zone=32 +datum=WGS84"))%>% #assign UTM
spTransform(CRS("+proj=longlat +datum=WGS84"))%>% #convert to degrees datum WGS84
as.data.frame()
UTM_33= df %>%
filter(GPS_U == 33)%>%
select(E_UTM, N_UTM)%>%
SpatialPoints(proj4string=CRS("+proj=utm +zone=33 +datum=WGS84"))%>%
spTransform(CRS("+proj=longlat +datum=WGS84"))%>%
as.data.frame()
UTM_32['ID'] = UTM_32_index
UTM_33['ID'] = UTM_33_index
lat_lon = rbind(UTM_32, UTM_33)
names(lat_lon) = c('lon', 'lat', 'ID')
df = left_join(df, lat_lon, 'ID')
#discard old coordinates
df = select(df, -c("GPS_U","E_UTM","N_UTM"))
Checking that everything is fine
ggmap::register_google(key = 'AIzaSyAqqancfrHjBacPzVbnSOpex797_tNgDXI')
p <- ggmap(get_googlemap(center = c(lon = mean(df$lon, na.rm = TRUE), lat = mean(df$lat, na.rm = TRUE)),
zoom = 9, scale = 2,
maptype ='terrain',
color = 'color'))
p + geom_point(aes(x = lon, y = lat), data = df, size = 1) +
theme(legend.position="bottom")
it seems that the west part is not over a highway. is this highway under construction? minor road? something is wrong?
Plotting the colors of the samples
#convert rgb to hexadecimal
colores = data.frame('color'=0)
for (i in 1:nrow(df)){
if (is.na(df$red[i])) {
colores[i,1] = '#FFFFFF'
}
else{
colores[i,1] = rgb(df$red[i],df$green[i],df$blue[i], maxColorValue=255)
}
}
#creating a map
ggplot(df, aes(lon, lat))+
geom_jitter(aes(colour = factor(ID)), show.legend=FALSE)+
scale_color_manual(values = colores[,1])
looks ok? white dots = no data
Data structure
plot_intro(df, title = 'DF structure')
Basic statistics
summary(df)
## id L A B
## Length:145 Min. :32.79 Min. : 9.107 Min. :14.73
## Class :character 1st Qu.:41.65 1st Qu.:14.868 1st Qu.:21.40
## Mode :character Median :44.50 Median :16.610 Median :23.26
## Mean :44.13 Mean :16.426 Mean :23.04
## 3rd Qu.:47.04 3rd Qu.:18.358 3rd Qu.:25.13
## Max. :53.27 Max. :20.230 Max. :28.28
## NA's :2 NA's :2 NA's :2
## C H red green
## Min. :19.63 Min. :42.77 Min. :107.6 Min. : 66.97
## 1st Qu.:27.30 1st Qu.:50.19 1st Qu.:134.4 1st Qu.: 86.21
## Median :28.76 Median :53.55 Median :142.0 Median : 93.41
## Mean :28.43 Mean :54.39 Mean :140.8 Mean : 93.42
## 3rd Qu.:30.14 3rd Qu.:58.10 3rd Qu.:148.1 3rd Qu.:101.39
## Max. :32.68 Max. :69.03 Max. :163.6 Max. :118.13
## NA's :2 NA's :2 NA's :2 NA's :2
## blue fed c_per Fe
## Min. :52.10 Min. : 1.984 Min. :0.1050 Min. : 42469
## 1st Qu.:62.05 1st Qu.: 3.957 1st Qu.:0.2182 1st Qu.: 66581
## Median :65.41 Median : 4.609 Median :0.3165 Median : 74808
## Mean :66.20 Mean : 4.862 Mean :0.3367 Mean : 76524
## 3rd Qu.:69.62 3rd Qu.: 5.503 3rd Qu.:0.4412 3rd Qu.: 85667
## Max. :81.00 Max. :11.100 Max. :0.9370 Max. :118069
## NA's :2 NA's :5 NA's :5 NA's :2
## FeError Zr ZrError prof_depth
## Min. : 691.4 Min. :266.5 Min. :12.15 Min. : 28.0
## 1st Qu.:1017.3 1st Qu.:350.0 1st Qu.:13.63 1st Qu.:274.0
## Median :1132.7 Median :388.7 Median :14.34 Median :402.0
## Mean :1154.9 Mean :398.6 Mean :14.49 Mean :391.9
## 3rd Qu.:1274.8 3rd Qu.:442.9 3rd Qu.:15.24 3rd Qu.:495.0
## Max. :1773.9 Max. :711.0 Max. :20.25 Max. :702.0
## NA's :2 NA's :2 NA's :2
## samp_depth weight_nod weight_qz geology slope
## Min. : 18.67 Min. : 0.0000 Min. : 0.0000 E :101 1 :59
## 1st Qu.: 68.14 1st Qu.: 0.2314 1st Qu.: 0.1580 NA's: 44 2 :35
## Median :115.08 Median : 0.8250 Median : 0.3675 3 : 4
## Mean :213.00 Mean : 2.3159 Mean : 1.2891 4 :45
## 3rd Qu.:349.37 3rd Qu.: 2.8611 3rd Qu.: 1.4424 NA's: 2
## Max. :670.94 Max. :26.8524 Max. :21.4690
## NA's :2 NA's :2 NA's :2
## horizon pH texture coarse_per int_per
## Bt : 2 Min. :3.990 C : 7 Min. :20.00 Min. : 3.000
## Bt1 : 2 1st Qu.:4.415 HC : 5 1st Qu.:30.00 1st Qu.: 7.500
## B(t) : 1 Median :4.620 SC : 3 Median :35.00 Median : 8.000
## B(t)1 : 1 Mean :4.605 NA's:130 Mean :36.33 Mean : 8.667
## B(t)7 : 1 3rd Qu.:4.760 3rd Qu.:42.50 3rd Qu.:10.000
## (Other): 8 Max. :5.180 Max. :58.00 Max. :19.000
## NA's :130 NA's :130 NA's :130 NA's :130
## fine_per ID lon lat
## Min. :39.0 Min. : 1 Min. :11.07 Min. :3.824
## 1st Qu.:49.5 1st Qu.: 37 1st Qu.:11.32 1st Qu.:3.873
## Median :57.0 Median : 73 Median :11.61 Median :3.890
## Mean :55.0 Mean : 73 Mean :11.62 Mean :3.885
## 3rd Qu.:60.5 3rd Qu.:109 3rd Qu.:11.94 3rd Qu.:3.897
## Max. :69.0 Max. :145 Max. :12.34 Max. :3.922
## NA's :130 NA's :8 NA's :8
Expertise in the area may notice something weird in a variable.
Missing values visualization
plot_missing(df, group = list(Good = 0.05, OK = 0.4, Bad = 1))
Histograms for detecting univariate anomalies
DataExplorer::plot_histogram(df)
#create df with numeric variables
df_num = select_if(df, is.numeric)
#only looking on L A and B
df_num_col1=df_num %>%
select(-c(C:blue))%>% #other colors
select(-contains('Error'))%>%#measurement errors
select(-ID)%>% #index
select(-c("pH", "coarse_per", "int_per", "fine_per"))#high percent of missing values (numeric)
#correlation matrix is calculated
cor_df_num_col1 = round(cor(df_num_col1,
method = "pearson",
use = 'pairwise.complete.obs'),3)
#p-values are calculated. CI with 95 confidence
cor_df_num_col1_p = cor.mtest(df_num_col1, conf.level = .95)
Correlogram for better visualization
#Correlogram
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(cor_df_num_col1, method="color", col=col(200),
type="upper",
addCoef.col = "dimgrey", number.cex = 1, # Add coefficient of correlation
tl.col="black", tl.srt=45, tl.cex = 1, #Text label color and rotation and size
# Combine with significance
p.mat = cor_df_num_col1_p$p, sig.level = 0.05, insig = "blank",
# hide correlation coefficient on the principal diagonal
diag=F
)
Notice: color = pearson correlation coefficient
numbers = pearson correlation coefficient
blank = significance levels beneath 0.95
The first three rows gives us an idea of variables that correlate with L, A and B: fed, c_per, Fe, depths, weight nods and qz, and longitud . Notice that L and B are higly correlated.
Network plot.
df_num_col1 %>%
correlate(use = 'complete.obs')%>%
network_plot(min_cor = 0.15) # only showing correlations superior to 0.15.
It is easier to see the relationship between variables looking at its closeness and color intensity.
Scatter plots with regression, correlation coeficient and histograms
ggpairs(df_num_col1, lower = list(continuous = wrap("smooth", alpha = 0.3, size=0.1)))
Analizyng colour behaviour
# plot_ly(df, x = ~red, y = ~green, z = ~blue, color = ~ID, colors = colores[,1]) %>%
# add_markers() %>%
# layout(scene = list(xaxis = list(title = 'L'),
# yaxis = list(title = 'A'),
# zaxis = list(title = 'B')))
plot_ly(df, x = ~L, y = ~A, z = ~B, color = ~ID, colors = colores[,1]) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'L'),
yaxis = list(title = 'A'),
zaxis = list(title = 'B')))
It seems to be a grouping somehow. going from clearer to darker. it is not very clear though…. Later I found out that L and B are better represented in this grouping. It might be interesting to try with combinations. For now, we stick with this.
Grouping colors to generate a scale that could be used as a broader classificator.
df_cluster = df %>%
select(c('A', 'L', 'B', 'ID'))%>%
filter(!is.na(df$A))
set.seed(100) #seed to attain same results
clusters = kmeans(df_cluster[,1:3],3) #kmeans with 3 groups
df_cluster$c_group = clusters$cluster
df_cluster = select(df_cluster,-c('A', 'L', 'B'))
#transformo la matriz para asignarle la categorĂa segĂºn el clĂºster
df_cluster = df_cluster %>%
mutate(category = as.factor(ifelse(c_group ==1, 'dark',
ifelse(c_group ==2, 'light',
ifelse(c_group == 3, 'medium', NA)))))%>%
select(-c_group)
df_cluster = left_join(df, df_cluster, by='ID')
df_cluster$category = factor(df_cluster$category)
plot_ly(df_cluster, x = ~L, y = ~A, z = ~B, color = ~ID, colors = colores[,1],symbol = ~category, symbols = c('circle', 'x', 'diamond') ) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'L'),
yaxis = list(title = 'A'),
zaxis = list(title = 'B')))
## Warning: Ignoring 2 observations
Nice visualizations with categories
category=df_cluster$category
df_num_col1_clust = cbind(df_num_col1, category)
ggpairs(df_num_col1_clust,aes(colour = category, alpha = 0.4))
Very interesting graph!!! you can see a lot of nice things in here related to the color grouping. That’s it for now =) espero q te vaya sirviendo