setwd("~/Desktop/intro to ds/asm")
install.packages("tinytex")
---
title: "Assignment 1: Introduction to Data Science"
author: "UvA Student"
output: pdf_document
---
# Fix the RNG seed so the simulated sample is reproducible.
set.seed(225)
# Simulated data: x.n holds 50000 standard-normal draws, x.p holds 50000
# log-normal draws (meanlog 0, sdlog 1). x.n is drawn before x.p.
Data <- data.frame(
  x.n = rnorm(n = 50000, mean = 0, sd = 1),
  x.p = rlnorm(n = 50000, meanlog = 0, sdlog = 1)
)
# Histogram (50 bins) of the normal sample x.n.
p1 <- ggplot(data = Data, mapping = aes(x = x.n)) +
  geom_histogram(bins = 50, fill = "skyblue", colour = "black") +
  labs(title = "Histogram of x.n")
# Boxplot of the same sample, drawn along the y axis.
p2 <- ggplot(data = Data, mapping = aes(y = x.n)) +
  geom_boxplot(fill = "orange") +
  labs(title = "Boxplot of x.n")
# Show both plots next to each other in a single row.
grid.arrange(p1, p2, nrow = 1)
# Sample mean and standard deviation of x.n; each value is auto-printed below.
mean_xn <- mean(Data[["x.n"]])
sd_xn <- sd(Data[["x.n"]])
mean_xn
## [1] -0.003239091
sd_xn
## [1] 1.002094
The data x.n is generated using a standard normal distribution. The sample mean and standard deviation are close to 0 and 1, respectively, as expected.
The sample mean and standard deviation summarize the center and spread. For a normal distribution, this is meaningful. Since the normal distribution has thin tails, extreme values are rare, and predictions using the mean are generally reliable.
# Sample mean and standard deviation of x.p; each value is auto-printed below.
mean_xp <- mean(Data[["x.p"]])
sd_xp <- sd(Data[["x.p"]])
mean_xp
## [1] 1.647422
sd_xp
## [1] 2.135505
# Histogram and boxplot of x.p, both zoomed to the range [0, 10].
#
# The histogram zooms with coord_cartesian() rather than xlim(). xlim() sets
# scale limits, so observations above 10 are dropped before binning and the
# edge bars whose bins extend past the limits are discarded as well.
# Bins of width 0.2 anchored at 0 give 50 bars inside the visible window.
p3 <- ggplot(Data, aes(x = x.p)) +
  geom_histogram(
    binwidth = 0.2, boundary = 0,
    fill = "purple", color = "black"
  ) +
  coord_cartesian(xlim = c(0, 10)) +
  ggtitle("Histogram of x.p (Log-normal, cut off at 10)")
# The boxplot statistics use all observations; only the view is limited.
p4 <- ggplot(Data, aes(y = x.p)) +
  geom_boxplot(fill = "pink") +
  coord_cartesian(ylim = c(0, 10)) +
  ggtitle("Boxplot of x.p (cut off at 10)")
grid.arrange(p3, p4, nrow = 1)
The mean and SD are finite but heavily influenced by outliers. The heavy right tail of the log-normal distribution (x.p is generated with rlnorm) means extremely large values occur with non-negligible probability. Thus, the mean is a poor predictor for new observations.
# Load the car data, treating both "NA" and "?" as missing values.
car_data <- read.csv("Car_data.csv", na.strings = c("NA", "?"))
# Keep only the rows that have a recorded price.
car_data_clean <- filter(car_data, !is.na(price))
# Number of rows in the raw data whose price is missing.
sum(is.na(car_data[["price"]]))
## [1] 4
# Distribution of price over the rows that have a recorded price (50 bins).
ggplot(data = car_data_clean, mapping = aes(x = price)) +
  geom_histogram(bins = 50, fill = "steelblue", colour = "black") +
  labs(title = "Histogram of Car Prices")
# Scatter plot of price against each predictor, with a fitted linear trend.
vars <- c("curb.weight", "engine.size", "horsepower", "highway.mpg")
for (v in vars) {
  # `.data[[v]]` looks the column up by its string name. It replaces
  # aes_string(), which is deprecated since ggplot2 3.0.0. The axis labels
  # are set explicitly so they show the column names.
  p <- ggplot(car_data_clean, aes(x = .data[[v]], y = price)) +
    geom_point(alpha = 0.5) +
    geom_smooth(method = "lm", se = FALSE, col = "red") +
    labs(x = v, y = "price") +
    ggtitle(paste("Price vs", v))
  print(p)
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
We observe strong positive relationships between price and variables like engine size, horsepower, and curb weight. There is a negative correlation with highway mpg, suggesting more efficient cars tend to be cheaper.
# Predictors used for the PCA; rows with a missing value in any of them
# are dropped.
pc_vars <- na.omit(
  select(car_data_clean, curb.weight, engine.size, horsepower, highway.mpg)
)
# PCA with each variable scaled to unit variance (scale. = TRUE).
pca_result <- prcomp(pc_vars, scale. = TRUE)
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.8318 0.57125 0.48861 0.28209
## Proportion of Variance 0.8388 0.08158 0.05969 0.01989
## Cumulative Proportion 0.8388 0.92042 0.98011 1.00000
# Loadings of each variable on each principal component.
pca_result[["rotation"]]
## PC1 PC2 PC3 PC4
## curb.weight 0.5073415 -0.18580359 -0.6571533 -0.5255770
## engine.size 0.4999754 -0.63617369 0.1031988 0.5784960
## horsepower 0.5045853 0.09956758 0.7262252 -0.4561544
## highway.mpg -0.4878759 -0.74219024 0.1734832 -0.4254813
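PC1: an overall size/power component. Curb weight, engine size and horsepower load positively (about 0.5 each), while highway mpg loads negatively (about -0.49). PC2: dominated by engine size (-0.64) and highway mpg (-0.74), which load with the same sign. PC3: contrasts horsepower (0.73) with curb weight (-0.66) and captures part of the remaining structure.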
# Principal-component scores, one row per observation used in the PCA.
pc_scores <- as.data.frame(pca_result$x)
# Attach each observation's price by matching row names explicitly.
# Using the row names as positional indices is only correct while
# car_data_clean happens to have row names 1..n.
row_idx <- match(rownames(pc_scores), rownames(car_data_clean))
stopifnot(!anyNA(row_idx))
pc_scores$price <- car_data_clean$price[row_idx]
# Scatter plot of price against each of the first three components.
for (i in 1:3) {
  pc <- paste0("PC", i)
  # `.data[[pc]]` looks the column up by its string name. It replaces
  # aes_string(), which is deprecated since ggplot2 3.0.0.
  p <- ggplot(pc_scores, aes(x = .data[[pc]], y = price)) +
    geom_point(alpha = 0.5) +
    geom_smooth(method = "lm", se = FALSE, col = "darkgreen") +
    labs(x = pc, y = "price") +
    ggtitle(paste("Price vs PC", i))
  print(p)
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Principal components are predictive of price. PC1 has a strong positive correlation with price, capturing the combined effect of car size, power, and efficiency.
# Install the TinyTeX LaTeX distribution only when TinyTeX is not already the
# LaTeX distribution in use, so re-running the script does not trigger a
# reinstall. Presumably needed for the pdf_document output; confirm.
if (!tinytex::is_tinytex()) {
  tinytex::install_tinytex()
}