Exploratory-data-hw.R

library(psych)
library(corrplot)

## corrplot 0.95 loaded

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

library(data.table)
library(GGally)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

library(ggplot2)
library(tidyr)
# Read the Spotify CSV into a data frame and show per-column summary statistics.
spotify <- read.csv(file = "~/Documents/R Class 5/SpotifyTop2018_40_V2-2.csv")
summary(object = spotify)

##       X              danceability        energy          loudness     
##  Length:40          Min.   :0.2580   Min.   :0.3910   Min.   :-9.211  
##  Class :character   1st Qu.:0.6805   1st Qu.:0.5643   1st Qu.:-7.077  
##  Mode  :character   Median :0.7500   Median :0.6590   Median :-5.930  
##                     Mean   :0.7192   Mean   :0.6619   Mean   :-5.846  
##                     3rd Qu.:0.8175   3rd Qu.:0.7725   3rd Qu.:-4.624  
##                     Max.   :0.9220   Max.   :0.9090   Max.   :-3.093  
##   speechiness       acousticness      instrumentalness       liveness      
##  Min.   :0.02320   Min.   :0.000282   Min.   :0.000e+00   Min.   :0.02150  
##  1st Qu.:0.04955   1st Qu.:0.030800   1st Qu.:0.000e+00   1st Qu.:0.09527  
##  Median :0.10550   Median :0.121500   Median :3.220e-06   Median :0.11200  
##  Mean   :0.12129   Mean   :0.184621   Mean   :3.695e-03   Mean   :0.17713  
##  3rd Qu.:0.14450   3rd Qu.:0.227500   3rd Qu.:4.325e-05   3rd Qu.:0.29475  
##  Max.   :0.51600   Max.   :0.847000   Max.   :1.340e-01   Max.   :0.55200  
##     valence           tempo         duration_ms     time_signature
##  Min.   :0.0967   Min.   : 77.17   Min.   : 95467   Min.   :3.00  
##  1st Qu.:0.2875   1st Qu.: 95.04   1st Qu.:189856   1st Qu.:4.00  
##  Median :0.4375   Median :122.53   Median :212904   Median :4.00  
##  Mean   :0.4679   Mean   :122.11   Mean   :205646   Mean   :3.95  
##  3rd Qu.:0.6332   3rd Qu.:140.59   3rd Qu.:226983   3rd Qu.:4.00  
##  Max.   :0.9310   Max.   :191.70   Max.   :268867   Max.   :4.00

# Extended descriptive statistics (psych package) for every column.
psych::describe(x = spotify)

##                  vars  n      mean       sd    median   trimmed      mad
## X*                  1 40     20.50    11.69     20.50     20.50    14.83
## danceability        2 40      0.72     0.15      0.75      0.74     0.10
## energy              3 40      0.66     0.14      0.66      0.67     0.16
## loudness            4 40     -5.85     1.61     -5.93     -5.83     1.82
## speechiness         5 40      0.12     0.09      0.11      0.11     0.07
## acousticness        6 40      0.18     0.21      0.12      0.15     0.14
## instrumentalness    7 40      0.00     0.02      0.00      0.00     0.00
## liveness            8 40      0.18     0.13      0.11      0.16     0.06
## valence             9 40      0.47     0.23      0.44      0.46     0.25
## tempo              10 40    122.11    31.18    122.53    119.11    39.39
## duration_ms        11 40 205645.70 34324.03 212904.50 209182.22 28128.63
## time_signature     12 40      3.95     0.22      4.00      4.00     0.00
##                       min       max     range  skew kurtosis      se
## X*                   1.00     40.00     39.00  0.00    -1.29    1.85
## danceability         0.26      0.92      0.66 -1.25     1.55    0.02
## energy               0.39      0.91      0.52 -0.21    -1.02    0.02
## loudness            -9.21     -3.09      6.12 -0.09    -0.84    0.25
## speechiness          0.02      0.52      0.49  1.97     5.24    0.01
## acousticness         0.00      0.85      0.85  1.38     1.13    0.03
## instrumentalness     0.00      0.13      0.13  5.78    32.48    0.00
## liveness             0.02      0.55      0.53  1.13     0.13    0.02
## valence              0.10      0.93      0.83  0.20    -0.92    0.04
## tempo               77.17    191.70    114.53  0.62    -0.76    4.93
## duration_ms      95467.00 268867.00 173400.00 -1.13     1.88 5427.11
## time_signature       3.00      4.00      1.00 -3.98    14.16    0.03

# Correlation matrix over the numeric columns only.
# vapply() pins the result of the column test to one logical per column.
num <- spotify[vapply(spotify, is.numeric, logical(1))]
cor_mat <- cor(x = num, use = "pairwise.complete.obs")
print(cor_mat)

##                  danceability       energy    loudness speechiness acousticness
## danceability       1.00000000 -0.052973197 -0.04332003  0.20739685  0.016291189
## energy            -0.05297320  1.000000000  0.66563377 -0.03522523 -0.269420435
## loudness          -0.04332003  0.665633766  1.00000000 -0.28162075 -0.081193798
## speechiness        0.20739685 -0.035225228 -0.28162075  1.00000000  0.097140342
## acousticness       0.01629119 -0.269420435 -0.08119380  0.09714034  1.000000000
## instrumentalness  -0.10219848  0.159487395  0.07978047 -0.13197148 -0.150741663
## liveness          -0.03139211 -0.082066080 -0.05014529 -0.22559158 -0.155929897
## valence            0.47064850  0.465741856  0.40227345  0.05961193  0.174967390
## tempo             -0.30106712  0.005003471 -0.03443782  0.01845286 -0.064331831
## duration_ms       -0.30530339 -0.035214824  0.01191078 -0.14474048 -0.297226375
## time_signature     0.26882122  0.234110003 -0.03255661  0.22488415 -0.006533519
##                  instrumentalness    liveness     valence        tempo
## danceability          -0.10219848 -0.03139211  0.47064850 -0.301067115
## energy                 0.15948739 -0.08206608  0.46574186  0.005003471
## loudness               0.07978047 -0.05014529  0.40227345 -0.034437816
## speechiness           -0.13197148 -0.22559158  0.05961193  0.018452865
## acousticness          -0.15074166 -0.15592990  0.17496739 -0.064331831
## instrumentalness       1.00000000 -0.04609379 -0.13053402  0.254821754
## liveness              -0.04609379  1.00000000 -0.10839165 -0.281270059
## valence               -0.13053402 -0.10839165  1.00000000 -0.240701777
## tempo                  0.25482175 -0.28127006 -0.24070178  1.000000000
## duration_ms           -0.07217052 -0.11584968 -0.27330405 -0.174676210
## time_signature         0.04043786 -0.09059303  0.30294237  0.056347037
##                  duration_ms time_signature
## danceability     -0.30530339    0.268821223
## energy           -0.03521482    0.234110003
## loudness          0.01191078   -0.032556614
## speechiness      -0.14474048    0.224884152
## acousticness     -0.29722638   -0.006533519
## instrumentalness -0.07217052    0.040437860
## liveness         -0.11584968   -0.090593026
## valence          -0.27330405    0.302942371
## tempo            -0.17467621    0.056347037
## duration_ms       1.00000000   -0.272388752
## time_signature   -0.27238875    1.000000000

# Heat map of the upper triangle, with each coefficient printed in its cell.
corrplot(
  cor_mat,
  type = "upper",
  method = "color",
  addCoef.col = "black",
  tl.col = "black",
  tl.cex = 0.8
)

# Scatterplots of the most strongly correlated pairs, each with a linear fit.

# Energy (x) against loudness (y).
ggplot(data = spotify, mapping = aes(x = energy, y = loudness)) +
  geom_point(alpha = 0.8, color = "blue") +
  geom_smooth(se = FALSE, method = "lm", color = "red") +
  ggtitle("Energy vs Loudness") +
  xlab("Energy") +
  ylab("Loudness")

## `geom_smooth()` using formula = 'y ~ x'

# Valence (x) against danceability (y), with a linear trend line.
ggplot(data = spotify, mapping = aes(x = valence, y = danceability)) +
  geom_point(alpha = 0.8, color = "blue") +
  geom_smooth(se = FALSE, method = "lm", color = "red") +
  ggtitle("Danceability vs Valence") +
  xlab("Valence") +
  ylab("Danceability")

## `geom_smooth()` using formula = 'y ~ x'

# Energy (x) against valence (y), with a linear trend line.
# The y-axis label names the variable actually mapped to y (valence).
ggplot(spotify, aes(x = energy, y = valence)) +
  geom_point(color = "blue", alpha = 0.8) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  labs(title = "Energy vs Valence",
       x = "Energy",
       y = "Valence")

## `geom_smooth()` using formula = 'y ~ x'

# Valence (x) against loudness (y), with a linear trend line.
ggplot(data = spotify, mapping = aes(x = valence, y = loudness)) +
  geom_point(alpha = 0.8, color = "blue") +
  geom_smooth(se = FALSE, method = "lm", color = "red") +
  ggtitle("Loudness vs Valence") +
  xlab("Valence") +
  ylab("Loudness")

## `geom_smooth()` using formula = 'y ~ x'

# Boxplots: one per numeric column, with outliers drawn in red.

for (col in colnames(num)) {
  # .data[[col]] looks the column up by its string name inside aes().
  p <- ggplot(data = spotify, mapping = aes(y = .data[[col]])) +
    labs(y = col, title = paste("Boxplot of", col)) +
    geom_boxplot(outlier.colour = "red", fill = "lightblue")
  # Plots built inside a loop are not auto-printed, so print explicitly.
  print(p)
}

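# What are the most relevant associations between variables?
# In the data set, the strongest associations are energy-loudness (r = 0.67),
# valence-danceability (r = 0.47), valence-energy (r = 0.47) and
# valence-loudness (r = 0.40).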

## Titanic

# Read the Titanic CSV into a data frame.
titanic <- read.csv(file = "~/Documents/R Class 5/titanicV2020-2.csv")

# Drop identifier and free-text columns that are not used in the analysis.

titanic <- titanic[, setdiff(
  names(titanic),
  c("PassengerId", "Name", "Ticket", "Cabin", "SibSp", "Parch")
)]

# Not every column is numeric, so recode the categorical ones.

# Survival flag as a labelled factor: 0 -> "No", 1 -> "Yes".
titanic[["Survived"]] <- factor(
  titanic[["Survived"]],
  levels = 0:1,
  labels = c("No", "Yes")
)
# Binary sex indicator: 1 for "female", 0 otherwise (NA stays NA).
titanic[["Sex_num"]] <- as.numeric(titanic[["Sex"]] == "female")
# Embarked: assign an integer code to each category (factor level order).
titanic[["Embarked_num"]] <- as.numeric(factor(x = titanic[["Embarked"]]))
titanic[["Pclass"]] <- as.numeric(titanic[["Pclass"]])

# Impute missing ages with the mean of the observed ages, then round every
# age to two decimals. Only Age is treated here; other columns are untouched.

titanic$Age <- round(
  ifelse(is.na(titanic$Age), mean(titanic$Age, na.rm = TRUE), titanic$Age),
  digits = 2
)

# Recode important columns

# Number of rows, then number of columns.
c(nrow(titanic), ncol(titanic))

## [1] 1309    8

# What are the dimensions of the dataset?
# 1309 rows and 8 columns after dropping the unused variables.



# Numerical summary of every column in the cleaned data frame.

summary(object = titanic)

##  Survived      Pclass          Sex                 Age             Fare        
##  No :815   Min.   :1.000   Length:1309        Min.   : 0.17   Min.   :  0.000  
##  Yes:494   1st Qu.:2.000   Class :character   1st Qu.:22.00   1st Qu.:  7.896  
##            Median :3.000   Mode  :character   Median :29.88   Median : 14.454  
##            Mean   :2.295                      Mean   :29.88   Mean   : 33.295  
##            3rd Qu.:3.000                      3rd Qu.:35.00   3rd Qu.: 31.275  
##            Max.   :3.000                      Max.   :80.00   Max.   :512.329  
##                                                               NA's   :1        
##    Embarked            Sex_num       Embarked_num  
##  Length:1309        Min.   :0.000   Min.   :1.000  
##  Class :character   1st Qu.:0.000   1st Qu.:2.000  
##  Mode  :character   Median :0.000   Median :3.000  
##                     Mean   :0.356   Mean   :2.493  
##                     3rd Qu.:1.000   3rd Qu.:3.000  
##                     Max.   :1.000   Max.   :3.000  
##                                     NA's   :2

# Correlation matrix of the numeric columns (heat map follows); use it to
# identify correlated variables.

# vapply() pins the result of the column test to one logical per column.
tit_num <- titanic[vapply(titanic, is.numeric, logical(1))]
# Pairwise deletion: each coefficient uses the rows complete for that pair.
cor_mat <- cor(x = tit_num, use = "pairwise.complete.obs")
print(cor_mat)

##                  Pclass         Age       Fare     Sex_num Embarked_num
## Pclass        1.0000000 -0.36637817 -0.5586287 -0.12461672   0.18734921
## Age          -0.3663782  1.00000000  0.1718976 -0.05739490  -0.07287063
## Fare         -0.5586287  0.17189755  1.0000000  0.18552298  -0.23908624
## Sex_num      -0.1246167 -0.05739490  0.1855230  1.00000000  -0.09940791
## Embarked_num  0.1873492 -0.07287063 -0.2390862 -0.09940791   1.00000000

# Heat map of the upper triangle, with each coefficient printed in its cell.
corrplot(
  cor_mat,
  type = "upper",
  method = "color",
  addCoef.col = "black",
  tl.col = "black",
  tl.cex = 0.8
)

# Create scatterplots for all variable pairs.

# Age (x) against fare (y), with a linear trend line.
# The x-axis label names the variable actually mapped to x (Age).
ggplot(titanic, aes(x = Age, y = Fare)) +
  geom_point(color = "blue", alpha = 0.8) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  labs(title = "Age vs Fare",
       x = "Age",
       y = "Fare")

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

# Fare distribution split by sex; Sex_num is 0 for male and 1 for female.
ggplot(
  data = titanic,
  mapping = aes(
    x = factor(Sex_num, levels = c(0, 1), labels = c("Male", "Female")),
    y = Fare
  )
) +
  geom_boxplot(alpha = 0.7, fill = "lightblue") +
  ggtitle("Fare distribution by Sex") +
  xlab("Sex") +
  ylab("Fare")

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

Exploratory-data-hw.R

emiliano

2025-09-01