library(psych)
library(corrplot)
## corrplot 0.95 loaded
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
library(data.table)
library(GGally)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(ggplot2)
library(tidyr)
# Load the Spotify Top 2018 sample (40 rows per the summary below) and
# print a per-column summary.
# NOTE(review): the path is relative to the user's home directory; adjust
# it before running on another machine.
spotify <- read.csv("~/Documents/R Class 5/SpotifyTop2018_40_V2-2.csv")
summary(spotify)
## X danceability energy loudness
## Length:40 Min. :0.2580 Min. :0.3910 Min. :-9.211
## Class :character 1st Qu.:0.6805 1st Qu.:0.5643 1st Qu.:-7.077
## Mode :character Median :0.7500 Median :0.6590 Median :-5.930
## Mean :0.7192 Mean :0.6619 Mean :-5.846
## 3rd Qu.:0.8175 3rd Qu.:0.7725 3rd Qu.:-4.624
## Max. :0.9220 Max. :0.9090 Max. :-3.093
## speechiness acousticness instrumentalness liveness
## Min. :0.02320 Min. :0.000282 Min. :0.000e+00 Min. :0.02150
## 1st Qu.:0.04955 1st Qu.:0.030800 1st Qu.:0.000e+00 1st Qu.:0.09527
## Median :0.10550 Median :0.121500 Median :3.220e-06 Median :0.11200
## Mean :0.12129 Mean :0.184621 Mean :3.695e-03 Mean :0.17713
## 3rd Qu.:0.14450 3rd Qu.:0.227500 3rd Qu.:4.325e-05 3rd Qu.:0.29475
## Max. :0.51600 Max. :0.847000 Max. :1.340e-01 Max. :0.55200
## valence tempo duration_ms time_signature
## Min. :0.0967 Min. : 77.17 Min. : 95467 Min. :3.00
## 1st Qu.:0.2875 1st Qu.: 95.04 1st Qu.:189856 1st Qu.:4.00
## Median :0.4375 Median :122.53 Median :212904 Median :4.00
## Mean :0.4679 Mean :122.11 Mean :205646 Mean :3.95
## 3rd Qu.:0.6332 3rd Qu.:140.59 3rd Qu.:226983 3rd Qu.:4.00
## Max. :0.9310 Max. :191.70 Max. :268867 Max. :4.00
# Extended descriptive statistics for every column (n, mean, sd, median,
# trimmed mean, mad, min, max, range, skew, kurtosis, se).
describe(spotify)
## vars n mean sd median trimmed mad
## X* 1 40 20.50 11.69 20.50 20.50 14.83
## danceability 2 40 0.72 0.15 0.75 0.74 0.10
## energy 3 40 0.66 0.14 0.66 0.67 0.16
## loudness 4 40 -5.85 1.61 -5.93 -5.83 1.82
## speechiness 5 40 0.12 0.09 0.11 0.11 0.07
## acousticness 6 40 0.18 0.21 0.12 0.15 0.14
## instrumentalness 7 40 0.00 0.02 0.00 0.00 0.00
## liveness 8 40 0.18 0.13 0.11 0.16 0.06
## valence 9 40 0.47 0.23 0.44 0.46 0.25
## tempo 10 40 122.11 31.18 122.53 119.11 39.39
## duration_ms 11 40 205645.70 34324.03 212904.50 209182.22 28128.63
## time_signature 12 40 3.95 0.22 4.00 4.00 0.00
## min max range skew kurtosis se
## X* 1.00 40.00 39.00 0.00 -1.29 1.85
## danceability 0.26 0.92 0.66 -1.25 1.55 0.02
## energy 0.39 0.91 0.52 -0.21 -1.02 0.02
## loudness -9.21 -3.09 6.12 -0.09 -0.84 0.25
## speechiness 0.02 0.52 0.49 1.97 5.24 0.01
## acousticness 0.00 0.85 0.85 1.38 1.13 0.03
## instrumentalness 0.00 0.13 0.13 5.78 32.48 0.00
## liveness 0.02 0.55 0.53 1.13 0.13 0.02
## valence 0.10 0.93 0.83 0.20 -0.92 0.04
## tempo 77.17 191.70 114.53 0.62 -0.76 4.93
## duration_ms 95467.00 268867.00 173400.00 -1.13 1.88 5427.11
## time_signature 3.00 4.00 1.00 -3.98 14.16 0.03
# Correlation matrix
# Keep only the numeric columns (the character column `X` is dropped).
# vapply() always yields a logical vector, whereas sapply() returns a list
# for a zero-column data frame and would make the `[` subsetting fail.
num <- spotify[vapply(spotify, is.numeric, logical(1))]
# Pairwise-complete observations: each coefficient uses every row where
# both variables are present.
cor_mat <- cor(num, use = "pairwise.complete.obs")
print(cor_mat)
## danceability energy loudness speechiness acousticness
## danceability 1.00000000 -0.052973197 -0.04332003 0.20739685 0.016291189
## energy -0.05297320 1.000000000 0.66563377 -0.03522523 -0.269420435
## loudness -0.04332003 0.665633766 1.00000000 -0.28162075 -0.081193798
## speechiness 0.20739685 -0.035225228 -0.28162075 1.00000000 0.097140342
## acousticness 0.01629119 -0.269420435 -0.08119380 0.09714034 1.000000000
## instrumentalness -0.10219848 0.159487395 0.07978047 -0.13197148 -0.150741663
## liveness -0.03139211 -0.082066080 -0.05014529 -0.22559158 -0.155929897
## valence 0.47064850 0.465741856 0.40227345 0.05961193 0.174967390
## tempo -0.30106712 0.005003471 -0.03443782 0.01845286 -0.064331831
## duration_ms -0.30530339 -0.035214824 0.01191078 -0.14474048 -0.297226375
## time_signature 0.26882122 0.234110003 -0.03255661 0.22488415 -0.006533519
## instrumentalness liveness valence tempo
## danceability -0.10219848 -0.03139211 0.47064850 -0.301067115
## energy 0.15948739 -0.08206608 0.46574186 0.005003471
## loudness 0.07978047 -0.05014529 0.40227345 -0.034437816
## speechiness -0.13197148 -0.22559158 0.05961193 0.018452865
## acousticness -0.15074166 -0.15592990 0.17496739 -0.064331831
## instrumentalness 1.00000000 -0.04609379 -0.13053402 0.254821754
## liveness -0.04609379 1.00000000 -0.10839165 -0.281270059
## valence -0.13053402 -0.10839165 1.00000000 -0.240701777
## tempo 0.25482175 -0.28127006 -0.24070178 1.000000000
## duration_ms -0.07217052 -0.11584968 -0.27330405 -0.174676210
## time_signature 0.04043786 -0.09059303 0.30294237 0.056347037
## duration_ms time_signature
## danceability -0.30530339 0.268821223
## energy -0.03521482 0.234110003
## loudness 0.01191078 -0.032556614
## speechiness -0.14474048 0.224884152
## acousticness -0.29722638 -0.006533519
## instrumentalness -0.07217052 0.040437860
## liveness -0.11584968 -0.090593026
## valence -0.27330405 0.302942371
## tempo -0.17467621 0.056347037
## duration_ms 1.00000000 -0.272388752
## time_signature -0.27238875 1.000000000
# Heat map of the upper triangle of the Spotify correlation matrix,
# with the coefficient values overlaid in black.
corrplot(
  cor_mat,
  type = "upper",
  method = "color",
  addCoef.col = "black",
  tl.col = "black",
  tl.cex = 0.8
)

# Scatterplots
# Energy (x) against loudness (y) with a fitted linear trend line.
ggplot(data = spotify, mapping = aes(x = energy, y = loudness)) +
  geom_point(color = "blue", alpha = 0.8) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  ggtitle("Energy vs Loudness") +
  xlab("Energy") +
  ylab("Loudness")
## `geom_smooth()` using formula = 'y ~ x'

# Valence (x) against danceability (y) with a fitted linear trend line.
ggplot(data = spotify, mapping = aes(x = valence, y = danceability)) +
  geom_point(color = "blue", alpha = 0.8) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  ggtitle("Danceability vs Valence") +
  xlab("Valence") +
  ylab("Danceability")
## `geom_smooth()` using formula = 'y ~ x'

# Energy (x) against valence (y) with a fitted linear trend line.
# The y axis is mapped to `valence`, so it is labelled "Valence"
# (it was previously mislabelled "Danceability").
ggplot(spotify, aes(x = energy, y = valence)) +
  geom_point(color = "blue", alpha = 0.8) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  labs(
    title = "Energy vs Valence",
    x = "Energy",
    y = "Valence"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Valence (x) against loudness (y) with a fitted linear trend line.
ggplot(data = spotify, mapping = aes(x = valence, y = loudness)) +
  geom_point(color = "blue", alpha = 0.8) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  ggtitle("Loudness vs Valence") +
  xlab("Valence") +
  ylab("Loudness")
## `geom_smooth()` using formula = 'y ~ x'

# Boxplots: one plot per numeric Spotify column, outliers drawn in red.
for (var_name in names(num)) {
  # .data[[...]] looks the column up by its string name inside aes().
  box_plot <- ggplot(data = spotify, mapping = aes(y = .data[[var_name]])) +
    geom_boxplot(fill = "lightblue", outlier.colour = "red") +
    ggtitle(paste("Boxplot of", var_name)) +
    ylab(var_name)
  # Plots are not auto-printed inside a loop, so print explicitly.
  print(box_plot)
}











# What are the most relevant associations between variables
# in the data set?
## Titanic
# Load the Titanic passenger data.
titanic <- read.csv("~/Documents/R Class 5/titanicV2020-2.csv")
# Eliminate all unnecessary variables (identifiers, free text, family counts).
titanic <- titanic[setdiff(
  names(titanic),
  c("PassengerId", "Name", "Ticket", "Cabin", "SibSp", "Parch")
)]
# Not all columns are numeric, so recode the categorical ones.
# Survived: 0/1 becomes a labelled factor.
titanic[["Survived"]] <- factor(
  titanic[["Survived"]],
  levels = c(0, 1),
  labels = c("No", "Yes")
)
# Sex: numeric indicator, 1 = female, 0 = otherwise.
titanic[["Sex_num"]] <- as.numeric(titanic[["Sex"]] == "female")
# Embarked: assign numbers to the categories (integer codes of the factor levels).
titanic[["Embarked_num"]] <- as.numeric(factor(titanic[["Embarked"]]))
titanic[["Pclass"]] <- as.numeric(titanic[["Pclass"]])
# Impute missing ages with the mean age, then round to two decimals.
# NOTE(review): only Age is imputed here; the summary output further down
# still reports NAs in Fare and Embarked_num.
age_missing <- is.na(titanic[["Age"]])
titanic[["Age"]] <- round(
  replace(titanic[["Age"]], age_missing, mean(titanic[["Age"]], na.rm = TRUE)),
  2
)
# Recode important columns
# What are the dimensions of the dataset?
dim(titanic)
## [1] 1309 8
# Carry out a numerical summary
summary(titanic)
## Survived Pclass Sex Age Fare
## No :815 Min. :1.000 Length:1309 Min. : 0.17 Min. : 0.000
## Yes:494 1st Qu.:2.000 Class :character 1st Qu.:22.00 1st Qu.: 7.896
## Median :3.000 Mode :character Median :29.88 Median : 14.454
## Mean :2.295 Mean :29.88 Mean : 33.295
## 3rd Qu.:3.000 3rd Qu.:35.00 3rd Qu.: 31.275
## Max. :3.000 Max. :80.00 Max. :512.329
## NA's :1
## Embarked Sex_num Embarked_num
## Length:1309 Min. :0.000 Min. :1.000
## Class :character 1st Qu.:0.000 1st Qu.:2.000
## Mode :character Median :0.000 Median :3.000
## Mean :0.356 Mean :2.493
## 3rd Qu.:1.000 3rd Qu.:3.000
## Max. :1.000 Max. :3.000
## NA's :2
# Calculate the correlation matrix. Create a heat map. Identify correlated variables
# Keep only the numeric columns. vapply() always yields a logical vector,
# whereas sapply() returns a list for a zero-column data frame and would
# make the `[` subsetting fail.
tit_num <- titanic[vapply(titanic, is.numeric, logical(1))]
# Pairwise-complete observations, so rows with a missing value only drop
# out of the pairs that involve that column.
# NOTE(review): Embarked_num holds factor-level codes for a categorical
# variable, so its correlations should be interpreted with caution.
cor_mat <- cor(tit_num, use = "pairwise.complete.obs")
print(cor_mat)
## Pclass Age Fare Sex_num Embarked_num
## Pclass 1.0000000 -0.36637817 -0.5586287 -0.12461672 0.18734921
## Age -0.3663782 1.00000000 0.1718976 -0.05739490 -0.07287063
## Fare -0.5586287 0.17189755 1.0000000 0.18552298 -0.23908624
## Sex_num -0.1246167 -0.05739490 0.1855230 1.00000000 -0.09940791
## Embarked_num 0.1873492 -0.07287063 -0.2390862 -0.09940791 1.00000000
# Heat map of the upper triangle of the Titanic correlation matrix,
# with the coefficient values overlaid in black.
corrplot(
  cor_mat,
  type = "upper",
  method = "color",
  addCoef.col = "black",
  tl.col = "black",
  tl.cex = 0.8
)

# Create scatterplots for all variable pairs.
# Age (x) against fare (y) with a fitted linear trend line.
# The x axis is mapped to `Age`, so it is labelled "Age"
# (it was previously mislabelled "Pclass").
ggplot(titanic, aes(x = Age, y = Fare)) +
  geom_point(color = "blue", alpha = 0.8) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  labs(
    title = "Age vs Fare",
    x = "Age",
    y = "Fare"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

# Fare distribution split by sex; Sex_num (0/1) is relabelled as
# Male/Female for the x axis.
ggplot(
  data = titanic,
  mapping = aes(
    x = factor(Sex_num, labels = c("Male", "Female")),
    y = Fare
  )
) +
  geom_boxplot(fill = "lightblue", alpha = 0.7) +
  ggtitle("Fare distribution by Sex") +
  xlab("Sex") +
  ylab("Fare")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
