data <- read_csv("synthetic_stock_data (1).csv")
unique_companies <- length(unique(data$Company))
unique_sectors <- length(unique(data$Sector))
cat("Number of unique companies:", unique_companies, "\n")
## Number of unique companies: 63
cat("Number of unique sectors:", unique_sectors, "\n")
## Number of unique sectors: 7
observations <- nrow(data)
variables <- ncol(data)
paste("Number of observations:", observations)
## [1] "Number of observations: 1000"
paste("Number of variables:", variables)
## [1] "Number of variables: 14"
data$Date <- as.Date(data$Date, format = "%d-%m-%Y")
range(data$Date)
## [1] "2022-01-01" "2024-09-26"
colSums(is.na(data))
## Date Company Sector Open High
## 0 0 0 0 0
## Low Close Volume Market_Cap PE_Ratio
## 0 0 0 2 0
## Dividend_Yield Volatility Sentiment_Score Trend
## 0 0 0 0
data_2023 <- subset(data, format(Date, "%Y") == "2023")
head(data_2023)
## # A tibble: 6 × 14
## Date Company Sector Open High Low Close Volume Market_Cap PE_Ratio
## <date> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2023-01-01 Meta Finan… 108. 111. 105. 108. 116617 2.03e 9 9.46
## 2 2023-01-02 Roche Techn… 111. 112. 108. 111. 66896 1.60e11 29.8
## 3 2023-01-03 Apple Consu… 111. 113. 109. 111. 180798 4.69e11 39.0
## 4 2023-01-04 Morgan S… Autom… 113. 115. 111. 113. 188877 9.44e11 33.9
## 5 2023-01-05 ExxonMob… Techn… 114. 116. 111. 114. 55237 1.03e11 29.5
## 6 2023-01-06 Roche Consu… 114. 116. 110. 114. 70056 8.54e11 34.4
## # ℹ 4 more variables: Dividend_Yield <dbl>, Volatility <dbl>,
## # Sentiment_Score <dbl>, Trend <chr>
highest_close <- max(data$Close, na.rm = TRUE)
row <- data[data$Close == highest_close, ]
highest_close
## [1] 128.9932
row$Company
## [1] "Chevron"
row$Date
## [1] "2024-08-08"
close_higher_than_open <- subset(data, Close > Open)
head(close_higher_than_open)
## # A tibble: 0 × 14
## # ℹ 14 variables: Date <date>, Company <chr>, Sector <chr>, Open <dbl>,
## # High <dbl>, Low <dbl>, Close <dbl>, Volume <dbl>, Market_Cap <dbl>,
## # PE_Ratio <dbl>, Dividend_Yield <dbl>, Volatility <dbl>,
## # Sentiment_Score <dbl>, Trend <chr>
max_row <- data[which.max(data$Volume), ]
cat("Highest trading volume:", max_row$Volume, "\n")
## Highest trading volume: 199849
cat("Date:", max_row$Date, "\n")
## Date: 19702
cat("Company:", max_row$Company, "\n")
## Company: Gilead Sciences
avg_close <- aggregate(Close ~ Sector, data, mean)
avg_close[which.max(avg_close$Close), ]
## Sector Close
## 5 Finance 110.7605
total_volume <- aggregate(Volume ~ Company, data, sum)
total_volume[which.max(total_volume$Volume), ]
## Company Volume
## 35 Moderna 3256043
avg_dividend <- aggregate(Dividend_Yield ~ Sector, data, mean)
avg_dividend[which.max(avg_dividend$Dividend_Yield), ]
## Sector Dividend_Yield
## 1 Aerospace 2.691776
data$Month <- format(data$Date, "%Y-%m")
avg_close_month <- aggregate(Close ~ Month, data, mean)
head(avg_close_month)
## Month Close
## 1 2022-01 99.43570
## 2 2022-02 95.84786
## 3 2022-03 95.07988
## 4 2022-04 93.24878
## 5 2022-05 95.51409
## 6 2022-06 95.21263
avg_pe <- aggregate(PE_Ratio ~ Company, data, mean)
ranked_pe <- avg_pe[order(-avg_pe$PE_Ratio), ]
head(ranked_pe)
## Company PE_Ratio
## 63 Wells Fargo 28.55813
## 4 Amazon 27.68080
## 3 Alibaba 27.33441
## 45 Porsche 27.29486
## 46 Raytheon 27.26369
## 61 Uber 27.10058
avg_cap <- aggregate(Market_Cap ~ Company, data, mean, na.rm = TRUE)
ranked_cap <- avg_cap[order(avg_cap$Market_Cap), ]
head(ranked_cap)
## Company Market_Cap
## 15 Ferrari 309180398477
## 62 Visa 325109600294
## 49 Samsung 365106595885
## 6 Apple 389545920851
## 33 Meta 399994003159
## 14 ExxonMobil 410736615385
bullish_data <- subset(data, Trend == "Bullish")
bullish_count <- sort(table(bullish_data$Sector), decreasing = TRUE)
head(bullish_count, 5)
##
## Healthcare Aerospace Consumer Goods Finance Energy
## 53 49 49 49 47
data$Valuation <- ifelse(data$PE_Ratio < 15, "Undervalued",
ifelse(data$PE_Ratio <= 25, "Fair", "Overvalued"))
head(data[, c("Company", "PE_Ratio", "Valuation")])
## # A tibble: 6 × 3
## Company PE_Ratio Valuation
## <chr> <dbl> <chr>
## 1 Uber 24.3 Fair
## 2 Tesla 18.6 Fair
## 3 Panasonic 10.7 Undervalued
## 4 Tencent 14.6 Undervalued
## 5 Wells Fargo 37.5 Overvalued
## 6 Snapchat 28.7 Overvalued
data$Dividend_Safety <- ifelse(data$Dividend_Yield < 4, "Safe", "Risky")
head(data[, c("Company", "Dividend_Yield", "Dividend_Safety")])
## # A tibble: 6 × 3
## Company Dividend_Yield Dividend_Safety
## <chr> <dbl> <chr>
## 1 Uber 0.163 Safe
## 2 Tesla 0.289 Safe
## 3 Panasonic 2.22 Safe
## 4 Tencent 1.38 Safe
## 5 Wells Fargo 3.11 Safe
## 6 Snapchat 4.96 Risky
#This analysis explores stock market data, examining relationships between opening and closing prices, market capitalization, trading volume, and sector performance. The analysis includes visualizations, regression models, and statistical tests to uncover patterns in stock behavior.
#Bar plot showing distribution of companies across sectors # This visualization shows which sectors have the most companies represented in the dataset. Financial or Technology sectors often dominate such analyses.
ggplot(data, aes(x = Sector, fill = Sector)) +
geom_bar() +
labs(title = "Number of Companies by Sector", x = "Sector", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#Build linear model predicting Close price from Open price #This shows the strong relationship between opening and closing prices. The high R-squared value would indicate that opening price is a good predictor of closing price.
model_slr <- lm(Close ~ Open, data = data[!is.na(data$Market_Cap), ])
summary(model_slr)
##
## Call:
## lm(formula = Close ~ Open, data = data[!is.na(data$Market_Cap),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.246e-14 -1.260e-15 -8.500e-16 -4.000e-16 9.843e-13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.936e-13 9.595e-15 3.059e+01 <2e-16 ***
## Open 1.000e+00 8.663e-17 1.154e+16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.124e-14 on 996 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 1.332e+32 on 1 and 996 DF, p-value: < 2.2e-16
ggplot(data, aes(x = Open, y = Close)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "SLR: Close vs Open", x = "Open Price", y = "Close Price")
#Build polynomial regression model (degree 2) for Close price vs Market Cap #This examines whether larger companies (by market cap) have different closing price patterns. The polynomial term helps capture any non-linear relationships.
model_poly <- lm(Close ~ poly(as.numeric(Market_Cap), 2), data = data[!is.na(data$Market_Cap), ])
summary(model_poly)
##
## Call:
## lm(formula = Close ~ poly(as.numeric(Market_Cap), 2), data = data[!is.na(data$Market_Cap),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.968 -10.087 3.540 8.621 19.939
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 110.1701 0.3615 304.784 <2e-16 ***
## poly(as.numeric(Market_Cap), 2)1 -15.8150 11.4192 -1.385 0.166
## poly(as.numeric(Market_Cap), 2)2 -3.9626 11.4192 -0.347 0.729
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.42 on 995 degrees of freedom
## Multiple R-squared: 0.002045, Adjusted R-squared: 3.86e-05
## F-statistic: 1.019 on 2 and 995 DF, p-value: 0.3612
ggplot(data, aes(x = Market_Cap, y = Close)) +
geom_point(alpha = 0.5) +
stat_smooth(method = "lm", formula = y ~ poly(x, 2), color = "blue") +
labs(title = "Polynomial Regression", x = "Market Cap", y = "Close")
#Shows the frequency distribution of stock prices. Most stocks likely cluster at lower price points with a long right tail of higher-priced stocks.
ggplot(data, aes(x = Close)) +
geom_histogram(bins = 30, fill = "blue", alpha = 0.7) +
labs(title = "Distribution of Closing Prices", x = "Closing Price", y = "Frequency")
#Quantifies the strength of relationship between opening and closing prices. A correlation near 1 would indicate very strong linear relationship.
cor_open_close <- cor(data$Open, data$Close, use = "complete.obs")
cat("Correlation between Open and Close:", cor_open_close, "\n")
## Correlation between Open and Close: 1
ggplot(data, aes(x = Open, y = Close)) +
geom_point(color = "steelblue", alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE, color = "darkred") +
labs(title = paste("Open vs Close (Correlation =", round(cor_open_close, 2), ")"),
x = "Open", y = "Close")
#Reveals which sectors tend to have higher/lower priced stocks and the variability within each sector. Technology and healthcare often show wider ranges.
ggplot(data, aes(x = Sector, y = Close, fill = Sector)) +
geom_boxplot() +
labs(title = "Close Price by Sector", x = "Sector", y = "Close") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#Calculate and visualize average trading volume by month #Shows seasonal patterns in trading activity. Typically higher volumes are seen at quarter ends or during earnings seasons.
monthly_volume <- data %>%
group_by(Month) %>%
summarize(Avg_Volume = mean(Volume, na.rm = TRUE))
ggplot(monthly_volume, aes(x = Month, y = Avg_Volume, group = 1)) +
geom_line(color = "purple") +
geom_point(color = "purple") +
labs(title = "Average Trading Volume by Month", x = "Month", y = "Avg Volume") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#Analysis of Variance to test if sector affects closing price #Determines whether sector membership significantly affects closing prices. A significant p-value would suggest sector does matter.
anova_model <- aov(Close ~ Sector, data = data[!is.na(data$Market_Cap), ])
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## Sector 6 395 65.91 0.504 0.806
## Residuals 991 129617 130.79