Package loading

library(tidyverse)
library(readxl)
library(stringr)
library(GGally)
library(corrplot)
library(Hmisc)
library(DataExplorer)
library(knitr)
library(rgdal)
library(ggmap)
library(plotly)
library(corrr)

Dataset Preparation

Loading the dataset

raw <- read_excel("200204_MSc_ALL.xlsx")

Variable cleansing and renaming

df = raw %>%
  select(c("id"= "Profile_ID", #ID
           L="L*", A = "a*", B = "b*", #colores #1
           C = "C*", H= "h*", #colores #2
           red = "R",green="G", blue= "B", #colores #3
           fed = "Fed", #hierro estas variables son linealmente dependientes "Fe (%) diluted","Fe (%); ppm", Fe (%) fueron eliminadas
           'c_per' = "% C", #porcentaje de materia organica
           "Fe", "FeError", "Zr", "ZrError", #elementos hierro y circonio
           prof_depth = "profile_depth_cm", #profundidad del perfil (cm)
           samp_depth = "sample_depth_cm", #profundidad de la muestra (cm)
           weight_nod ="weight_N_g",#peso nĂ³dulos de hierro(g)
           weight_qz = "weight_Qz_g",#peso de cuarzo (g)
           "GPS_U","E_UTM","N_UTM",#info de coordenadas
           geology = "Geology_Atlas_Micaschistes_Schistes_Embrechites", #M Micaschistes (up in a hill), SSchistes (slope), E Embrechite (low)
           slope = "slope_class_quali", #pendiente
           horizon = "Horizon",
           pH ="pH H2O", 
           texture="Texture group", #SUCS 
           coarse_per="2-0,05",int_per="0,05-0,002",fine_per="<0,002" #granulometrĂ­a(%)
           ))

Row cleansing

is_fenster = str_detect(df$id, "Fenster") #window observations that are not important

df = df %>%
  mutate(is_fenster = is_fenster) %>%
  filter(id != "NA")%>% #NA ids
  filter(id != "RK_HYA_A_1" )%>% #testing observation
  filter(is_fenster == FALSE)%>% #window observation not important
  select(-is_fenster)

Minor transformations

#Transformation of categorical variables into factors
categorical = c('geology', 'slope', 'horizon', 'texture')
df[categorical] <- lapply(df[categorical] , factor)

Coordinate transformation

#transformation of coordinates to long and lat WGS 1984
df$ID <- seq.int(nrow(df)) #setting an index

UTM_32_index = df %>%
  filter(GPS_U == 32)%>%
  select(ID)

UTM_33_index = df %>%
  filter(GPS_U == 33)%>%
  select(ID)

UTM_32= df %>%
  filter(GPS_U == 32)%>%
  select(E_UTM, N_UTM)%>%
  SpatialPoints(proj4string=CRS("+proj=utm +zone=32 +datum=WGS84"))%>% #assign UTM
  spTransform(CRS("+proj=longlat +datum=WGS84"))%>% #convert to degrees datum WGS84
  as.data.frame()

UTM_33= df %>%
  filter(GPS_U == 33)%>%
  select(E_UTM, N_UTM)%>%
  SpatialPoints(proj4string=CRS("+proj=utm +zone=33 +datum=WGS84"))%>%
  spTransform(CRS("+proj=longlat +datum=WGS84"))%>%
  as.data.frame()

UTM_32['ID'] = UTM_32_index
UTM_33['ID'] = UTM_33_index

lat_lon = rbind(UTM_32, UTM_33)
names(lat_lon) = c('lon', 'lat', 'ID')

df = left_join(df, lat_lon, 'ID')

#discard old coordinates
df = select(df, -c("GPS_U","E_UTM","N_UTM"))

Checking that everything is fine

ggmap::register_google(key = 'AIzaSyAqqancfrHjBacPzVbnSOpex797_tNgDXI')


p <- ggmap(get_googlemap(center = c(lon = mean(df$lon, na.rm = TRUE), lat = mean(df$lat, na.rm = TRUE)),
                         zoom = 9, scale = 2,
                         maptype ='terrain',
                         color = 'color'))


p + geom_point(aes(x = lon, y = lat), data = df, size = 1) + 
  theme(legend.position="bottom")

it seems that the west part is not over a highway. is this highway under construction? minor road? something is wrong?

Plotting the colors of the samples

#convert rgb to hexadecimal
colores = data.frame('color'=0)

for (i in 1:nrow(df)){
  if (is.na(df$red[i])) {
    colores[i,1] = '#FFFFFF'
  }
  else{
    colores[i,1] = rgb(df$red[i],df$green[i],df$blue[i], maxColorValue=255)
  }
}

#creating a map

ggplot(df, aes(lon, lat))+
  geom_jitter(aes(colour = factor(ID)), show.legend=FALSE)+
  scale_color_manual(values = colores[,1])

looks ok? white dots = no data

Exploratory data analysis

Data structure

plot_intro(df, title = 'DF structure')

Basic statistics

summary(df)
##       id                  L               A                B        
##  Length:145         Min.   :32.79   Min.   : 9.107   Min.   :14.73  
##  Class :character   1st Qu.:41.65   1st Qu.:14.868   1st Qu.:21.40  
##  Mode  :character   Median :44.50   Median :16.610   Median :23.26  
##                     Mean   :44.13   Mean   :16.426   Mean   :23.04  
##                     3rd Qu.:47.04   3rd Qu.:18.358   3rd Qu.:25.13  
##                     Max.   :53.27   Max.   :20.230   Max.   :28.28  
##                     NA's   :2       NA's   :2        NA's   :2      
##        C               H              red            green       
##  Min.   :19.63   Min.   :42.77   Min.   :107.6   Min.   : 66.97  
##  1st Qu.:27.30   1st Qu.:50.19   1st Qu.:134.4   1st Qu.: 86.21  
##  Median :28.76   Median :53.55   Median :142.0   Median : 93.41  
##  Mean   :28.43   Mean   :54.39   Mean   :140.8   Mean   : 93.42  
##  3rd Qu.:30.14   3rd Qu.:58.10   3rd Qu.:148.1   3rd Qu.:101.39  
##  Max.   :32.68   Max.   :69.03   Max.   :163.6   Max.   :118.13  
##  NA's   :2       NA's   :2       NA's   :2       NA's   :2       
##       blue            fed             c_per              Fe        
##  Min.   :52.10   Min.   : 1.984   Min.   :0.1050   Min.   : 42469  
##  1st Qu.:62.05   1st Qu.: 3.957   1st Qu.:0.2182   1st Qu.: 66581  
##  Median :65.41   Median : 4.609   Median :0.3165   Median : 74808  
##  Mean   :66.20   Mean   : 4.862   Mean   :0.3367   Mean   : 76524  
##  3rd Qu.:69.62   3rd Qu.: 5.503   3rd Qu.:0.4412   3rd Qu.: 85667  
##  Max.   :81.00   Max.   :11.100   Max.   :0.9370   Max.   :118069  
##  NA's   :2       NA's   :5        NA's   :5        NA's   :2       
##     FeError             Zr           ZrError        prof_depth   
##  Min.   : 691.4   Min.   :266.5   Min.   :12.15   Min.   : 28.0  
##  1st Qu.:1017.3   1st Qu.:350.0   1st Qu.:13.63   1st Qu.:274.0  
##  Median :1132.7   Median :388.7   Median :14.34   Median :402.0  
##  Mean   :1154.9   Mean   :398.6   Mean   :14.49   Mean   :391.9  
##  3rd Qu.:1274.8   3rd Qu.:442.9   3rd Qu.:15.24   3rd Qu.:495.0  
##  Max.   :1773.9   Max.   :711.0   Max.   :20.25   Max.   :702.0  
##  NA's   :2        NA's   :2       NA's   :2                      
##    samp_depth       weight_nod        weight_qz       geology     slope   
##  Min.   : 18.67   Min.   : 0.0000   Min.   : 0.0000   E   :101   1   :59  
##  1st Qu.: 68.14   1st Qu.: 0.2314   1st Qu.: 0.1580   NA's: 44   2   :35  
##  Median :115.08   Median : 0.8250   Median : 0.3675              3   : 4  
##  Mean   :213.00   Mean   : 2.3159   Mean   : 1.2891              4   :45  
##  3rd Qu.:349.37   3rd Qu.: 2.8611   3rd Qu.: 1.4424              NA's: 2  
##  Max.   :670.94   Max.   :26.8524   Max.   :21.4690                       
##  NA's   :2        NA's   :2         NA's   :2                             
##     horizon          pH        texture      coarse_per       int_per      
##  Bt     :  2   Min.   :3.990   C   :  7   Min.   :20.00   Min.   : 3.000  
##  Bt1    :  2   1st Qu.:4.415   HC  :  5   1st Qu.:30.00   1st Qu.: 7.500  
##  B(t)   :  1   Median :4.620   SC  :  3   Median :35.00   Median : 8.000  
##  B(t)1  :  1   Mean   :4.605   NA's:130   Mean   :36.33   Mean   : 8.667  
##  B(t)7  :  1   3rd Qu.:4.760              3rd Qu.:42.50   3rd Qu.:10.000  
##  (Other):  8   Max.   :5.180              Max.   :58.00   Max.   :19.000  
##  NA's   :130   NA's   :130                NA's   :130     NA's   :130     
##     fine_per          ID           lon             lat       
##  Min.   :39.0   Min.   :  1   Min.   :11.07   Min.   :3.824  
##  1st Qu.:49.5   1st Qu.: 37   1st Qu.:11.32   1st Qu.:3.873  
##  Median :57.0   Median : 73   Median :11.61   Median :3.890  
##  Mean   :55.0   Mean   : 73   Mean   :11.62   Mean   :3.885  
##  3rd Qu.:60.5   3rd Qu.:109   3rd Qu.:11.94   3rd Qu.:3.897  
##  Max.   :69.0   Max.   :145   Max.   :12.34   Max.   :3.922  
##  NA's   :130                  NA's   :8       NA's   :8

Expertise in the area may notice something weird in a variable.

Missing values visualization

plot_missing(df, group = list(Good = 0.05, OK = 0.4, Bad = 1))

Histograms for detecting univariate anomalies

DataExplorer::plot_histogram(df)

Correlation

#create df with numeric variables
df_num = select_if(df, is.numeric)

#only looking on L A and B

df_num_col1=df_num %>%
  select(-c(C:blue))%>% #other colors
  select(-contains('Error'))%>%#measurement errors
  select(-ID)%>% #index
  select(-c("pH", "coarse_per", "int_per", "fine_per"))#high percent of missing values (numeric)

#correlation matrix is calculated
cor_df_num_col1 = round(cor(df_num_col1,
    method = "pearson",
      use = 'pairwise.complete.obs'),3)

#p-values are calculated. CI with 95 confidence
cor_df_num_col1_p = cor.mtest(df_num_col1, conf.level = .95)

Correlogram for better visualization

#Correlogram
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))

corrplot(cor_df_num_col1, method="color", col=col(200),  
         type="upper", 
         addCoef.col = "dimgrey", number.cex = 1, # Add coefficient of correlation
         tl.col="black", tl.srt=45, tl.cex = 1, #Text label color and rotation and size
         # Combine with significance
         p.mat = cor_df_num_col1_p$p, sig.level = 0.05, insig = "blank", 
         # hide correlation coefficient on the principal diagonal
         diag=F
)

Notice: color = pearson correlation coefficient

numbers = pearson correlation coefficient

blank = significance levels beneath 0.95

The first three rows gives us an idea of variables that correlate with L, A and B: fed, c_per, Fe, depths, weight nods and qz, and longitud . Notice that L and B are higly correlated.

Network plot.

df_num_col1 %>%
  correlate(use = 'complete.obs')%>% 
  network_plot(min_cor = 0.15) # only showing correlations superior to 0.15.

It is easier to see the relationship between variables looking at its closeness and color intensity.

Scatter plots with regression, correlation coeficient and histograms

ggpairs(df_num_col1, lower = list(continuous = wrap("smooth", alpha = 0.3, size=0.1)))

Color experiments

Analizyng colour behaviour

# plot_ly(df, x = ~red, y = ~green, z = ~blue, color = ~ID, colors = colores[,1]) %>%
#   add_markers() %>%
#   layout(scene = list(xaxis = list(title = 'L'),
#                      yaxis = list(title = 'A'),
#                      zaxis = list(title = 'B')))

plot_ly(df, x = ~L, y = ~A, z = ~B, color = ~ID, colors = colores[,1]) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'L'),
                     yaxis = list(title = 'A'),
                     zaxis = list(title = 'B')))

It seems to be a grouping somehow. going from clearer to darker. it is not very clear though…. Later I found out that L and B are better represented in this grouping. It might be interesting to try with combinations. For now, we stick with this.

Grouping colors to generate a scale that could be used as a broader classificator.

df_cluster = df %>% 
  select(c('A', 'L', 'B', 'ID'))%>%
  filter(!is.na(df$A))
  

set.seed(100) #seed to attain same results

clusters = kmeans(df_cluster[,1:3],3) #kmeans with 3 groups


df_cluster$c_group = clusters$cluster
df_cluster = select(df_cluster,-c('A', 'L', 'B'))


#transformo la matriz para asignarle la categorĂ­a segĂºn el clĂºster
df_cluster = df_cluster %>%
  mutate(category = as.factor(ifelse(c_group ==1, 'dark',
                            ifelse(c_group ==2, 'light',
                                   ifelse(c_group  == 3, 'medium', NA)))))%>%
  select(-c_group)

df_cluster = left_join(df, df_cluster, by='ID')
df_cluster$category = factor(df_cluster$category)


plot_ly(df_cluster, x = ~L, y = ~A, z = ~B, color = ~ID, colors = colores[,1],symbol = ~category, symbols = c('circle', 'x', 'diamond') ) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'L'),
                     yaxis = list(title = 'A'),
                     zaxis = list(title = 'B')))
## Warning: Ignoring 2 observations

Nice visualizations with categories

category=df_cluster$category
df_num_col1_clust = cbind(df_num_col1, category)

ggpairs(df_num_col1_clust,aes(colour = category, alpha = 0.4))

Very interesting graph!!! you can see a lot of nice things in here related to the color grouping. That’s it for now =) espero q te vaya sirviendo