library(tidyverse)
library(readr)
library(dplyr)songs <- read_csv("100 Songs.csv")head(songs)## # A tibble: 6 × 14
## id name durat…¹ energy key loudn…² mode speec…³ acous…⁴ instr…⁵ liven…⁶
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4ZtF… Good… 2.97 0.664 9 -5.04 1 0.154 0.335 0 0.0849
## 2 5fxy… Stay… 2.3 0.506 8 -11.3 1 0.0589 0.379 8.68e-1 0.11
## 3 5nuj… Levi… 3.38 0.825 6 -3.79 0 0.0601 0.00883 0 0.0674
## 4 4iJy… Peac… 3.3 0.696 0 -6.18 1 0.119 0.321 0 0.42
## 5 1SC5… Mont… 2.3 0.503 8 -6.72 0 0.22 0.293 0 0.405
## 6 3Dar… Kiss… 3.48 0.705 8 -3.46 1 0.0284 0.259 8.92e-5 0.12
## # … with 3 more variables: valence <dbl>, tempo <dbl>, danceability <dbl>, and
## # abbreviated variable names ¹duration, ²loudness, ³speechiness,
## # ⁴acousticness, ⁵instrumentalness, ⁶liveness
library(dplyr)
library(tidyr)
data <- songs %>%
rename(
Acousticness = acousticness,
Danceability = danceability,
Duration = duration,
Energy = energy,
Id = id,
Instrumentalness = instrumentalness,
Key = key,
Liveness = liveness,
Loudness = loudness,
Mode = mode,
Name = name,
Speechiness = speechiness,
Tempo = tempo,
Valence = valence
)sum(is.na(data))## [1] 0
sum(duplicated(data))## [1] 10
Remove the duplicated
data <- unique(data)change the ID value to be NA
data$ Id<- as.numeric(data$Id)Replace the NA values in the Id column with numbers, you can use the if_else() function from dplyr package
library(dplyr)
data <- data %>%
mutate(Id = if_else(is.na(Id), 1:nrow(data), Id))knitr::kable(head(data), align = "c")| Id | Name | Duration | Energy | Key | Loudness | Mode | Speechiness | Acousticness | Instrumentalness | Liveness | Valence | Tempo | Danceability |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Good 4 U Olivia Rodrigo | 2.97 | 0.664 | 9 | -5.044 | 1 | 0.1540 | 0.33500 | 0.00e+00 | 0.0849 | 0.688 | 166.928 | 0.563 |
| 2 | Stay The Kid LAROI & Justin Bieber | 2.30 | 0.506 | 8 | -11.275 | 1 | 0.0589 | 0.37900 | 8.68e-01 | 0.1100 | 0.454 | 170.054 | 0.564 |
| 3 | Levitating Dua Lipa feat. DaBaby | 3.38 | 0.825 | 6 | -3.787 | 0 | 0.0601 | 0.00883 | 0.00e+00 | 0.0674 | 0.915 | 102.977 | 0.702 |
| 4 | Peaches Justin Bieber feat. Daniel Caesar & Giveon | 3.30 | 0.696 | 0 | -6.181 | 1 | 0.1190 | 0.32100 | 0.00e+00 | 0.4200 | 0.464 | 90.030 | 0.677 |
| 5 | Montero (Call Me By Your Name) Lil Nas X | 2.30 | 0.503 | 8 | -6.725 | 0 | 0.2200 | 0.29300 | 0.00e+00 | 0.4050 | 0.710 | 178.781 | 0.593 |
| 6 | Kiss Me More (feat. SZA) Doja Cat | 3.48 | 0.705 | 8 | -3.463 | 1 | 0.0284 | 0.25900 | 8.92e-05 | 0.1200 | 0.781 | 110.970 | 0.764 |
library(ggplot2)
ggplot(data, aes(x = "", y = Duration)) +
geom_boxplot() +
labs(y = "Duration (s)") +
theme_minimal()hist(data$Loudness, breaks = 20, col = "blue", xlab = "Loudness (dB)")str(data)## tibble [90 × 14] (S3: tbl_df/tbl/data.frame)
## $ Id : num [1:90] 1 2 3 4 5 6 7 8 9 10 ...
## $ Name : chr [1:90] "Good 4 U Olivia Rodrigo" "Stay The Kid LAROI & Justin Bieber" "Levitating Dua Lipa feat. DaBaby" "Peaches Justin Bieber feat. Daniel Caesar & Giveon" ...
## $ Duration : num [1:90] 2.97 2.3 3.38 3.3 2.3 3.48 4.03 4.03 3.33 3.59 ...
## $ Energy : num [1:90] 0.664 0.506 0.825 0.696 0.503 0.705 0.616 0.431 0.73 0.612 ...
## $ Key : num [1:90] 9 8 6 0 8 8 5 10 1 2 ...
## $ Loudness : num [1:90] -5.04 -11.28 -3.79 -6.18 -6.72 ...
## $ Mode : num [1:90] 1 1 0 1 0 1 1 1 1 1 ...
## $ Speechiness : num [1:90] 0.154 0.0589 0.0601 0.119 0.22 0.0284 0.0324 0.0578 0.0598 0.112 ...
## $ Acousticness : num [1:90] 0.335 0.379 0.00883 0.321 0.293 0.259 0.182 0.768 0.00146 0.584 ...
## $ Instrumentalness: num [1:90] 0.00 8.68e-01 0.00 0.00 0.00 8.92e-05 0.00 1.42e-05 9.54e-05 5.70e-06 ...
## $ Liveness : num [1:90] 0.0849 0.11 0.0674 0.42 0.405 0.12 0.0927 0.106 0.0897 0.37 ...
## $ Valence : num [1:90] 0.688 0.454 0.915 0.464 0.71 0.781 0.719 0.137 0.334 0.178 ...
## $ Tempo : num [1:90] 167 170 103 90 179 ...
## $ Danceability : num [1:90] 0.563 0.564 0.702 0.677 0.593 0.764 0.586 0.561 0.514 0.442 ...
Is there a relationship between certain characteristics of the songs, such as energy and danceability?
# Create a scatter plot of energy vs. danceability
ggplot(data, aes(x = Energy, y = Danceability)) +
geom_point() +
labs(x = "Energy", y = "Danceability")# Create a histogram of danceability
ggplot(data, aes(x = Danceability)) +
geom_histogram() +
labs(x = "Danceability")# Create a correlation matrix of all variables
correlation <- cor(data$Energy, data$Danceability)
print(correlation)## [1] 0.3336112
A correlation coefficient of 0.3336112 suggests a weak positive correlation between the two variables being analyzed. This means that there is a tendency for higher values of one variable to be associated with higher values of the other variable, but the relationship is not very strong. In this case, it indicates that there may be some relationship between the energy and danceability of songs, but it is not a strong or definitive relationship.
library(tidyverse)
library(readr)
library(dplyr)pharm<- read_csv("pharma spend by country.csv")head(pharm)## # A tibble: 6 × 7
## LOCATION TIME PC_HEALTHXP PC_GDP USD_CAP FLAG_CODES TOTAL_SPEND
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 AUS 1971 16.0 0.727 35.7 <NA> 462.
## 2 AUS 1972 15.1 0.686 36.1 <NA> 475.
## 3 AUS 1973 15.1 0.681 39.9 <NA> 533.
## 4 AUS 1974 14.8 0.755 47.6 <NA> 653.
## 5 AUS 1975 11.8 0.682 47.6 <NA> 661.
## 6 AUS 1976 10.9 0.63 46.9 <NA> 658.
groups the data by LOCATION using the group_by() function, and then summarizes the number of observations for each country using the summarize() function with the n() function
pharm %>%
group_by(LOCATION) %>%
summarize(n_obs = n())## # A tibble: 32 × 2
## LOCATION n_obs
## <chr> <int>
## 1 AUS 44
## 2 AUT 26
## 3 BEL 41
## 4 CAN 45
## 5 CHE 31
## 6 CZE 26
## 7 DEU 45
## 8 DNK 36
## 9 ESP 32
## 10 EST 17
## # … with 22 more rows
pharm%>%
group_by(TIME) %>%
summarize(n_obs = n())## # A tibble: 47 × 2
## TIME n_obs
## <dbl> <int>
## 1 1970 11
## 2 1971 11
## 3 1972 11
## 4 1973 11
## 5 1974 11
## 6 1975 13
## 7 1976 13
## 8 1977 13
## 9 1978 13
## 10 1979 13
## # … with 37 more rows
pharm%>%
summarize(n_countries = n_distinct(LOCATION),
n_years = n_distinct(TIME)) %>%
bind_rows(pharm %>%
group_by(LOCATION) %>%
summarize(n_obs = n()) %>%
summarize(n_countries = n_distinct(LOCATION),
n_obs = sum(n_obs)) %>%
select(n_countries, n_obs)) %>%
bind_rows(pharm %>%
group_by(TIME) %>%
summarize(n_obs = n()) %>%
summarize(n_years = n_distinct(TIME),
n_obs = sum(n_obs)) %>%
select(n_years, n_obs))## # A tibble: 3 × 3
## n_countries n_years n_obs
## <int> <int> <int>
## 1 32 47 NA
## 2 32 NA 1000
## 3 NA 47 1000
Some visualizations to explore the data
pharm %>%
group_by(TIME) %>%
summarize(n_obs = n()) %>%
ggplot(aes(x = TIME, y = n_obs)) +
geom_bar(stat = "identity") +
labs(x = "Year", y = "Number of Observations",
title = "Number of Observations by Year")library(tidyverse)
for_label <- pharm %>%
group_by(LOCATION) %>%
summarize(year=max(TIME), totalspend=max(TOTAL_SPEND))
ggplot(data=pharm, aes(x=TIME, y=TOTAL_SPEND, group=LOCATION, color=LOCATION)) +
geom_line(linewidth=1.2) +
geom_text(data=for_label, aes(x=year, y=totalspend + 4000, label=LOCATION), size=3, color="black", fontface="bold") +
scale_color_viridis_d() +
labs(title="Pharmaceutical Spending by Country, 1971-2017",
x="Year", y="Total Pharmaceutical Spending (USD billions)",
color="Country") +
theme_classic() +
theme(plot.title = element_text(face="bold", size=18),
axis.title = element_text(face="bold", size=14),
axis.text = element_text(size=12),
legend.position = "bottom",
legend.title = element_text(face="bold", size=12),
legend.text = element_text(size=10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
annotate("text", x=1980, y=60000, label="USA has highest spending", size=5, color="black", fontface="bold") +
annotate("segment", x=1980, xend=1990, y=60000, yend=60000, size=1.5, color="black", lineend="butt", arrow=arrow(length=unit(0.3,"cm"))) +
annotate("text", x=1980, y=25000, label="Japan spends second most", size=5, color="black", fontface="bold") +
annotate("segment", x=1980, xend=1990, y=25000, yend=25000, size=1.5, color="black", lineend="butt", arrow=arrow(length=unit(0.3,"cm"))) +
annotate("text", x=1995, y=20000, label="Australia, Canada, France, and Germany have similar spending", size=5, color="black", fontface="bold") +
annotate("segment", x=1995, xend=2005, y=20000, yend=20000, size=1.5, color="black", lineend="butt", arrow=arrow(length=unit(0.3,"cm")))library(dplyr)
summary_data <- pharm %>%
group_by(LOCATION) %>%
summarize(mean_spend = mean(TOTAL_SPEND),
median_spend = median(TOTAL_SPEND),
min_spend = min(TOTAL_SPEND),
max_spend = max(TOTAL_SPEND),
sd_spend = sd(TOTAL_SPEND))
head (summary_data)## # A tibble: 6 × 6
## LOCATION mean_spend median_spend min_spend max_spend sd_spend
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AUS 4786. 2765. 462. 14504. 4542.
## 2 AUT 3182. 3251. 1167. 5463. 1392.
## 3 BEL 3081. 2063. 405. 7655. 2425.
## 4 CAN 9892. 6974. 736. 27931. 9120.
## 5 CHE 3323. 2594. 1108. 8747. 2289.
## 6 CZE 3019. 3076. 994. 4659. 1159.
It is clear from the plots and the summary that the USA has consistently been the biggest pharmaceutical spender among the countries included in the dataset, with spending levels several times higher than other countries.