Load Dataset

data <- read_csv("synthetic_stock_data (1).csv")

Level 1 - Basic Exploration

Unique Companies and Sectors

unique_companies <- length(unique(data$Company))
unique_sectors <- length(unique(data$Sector))
cat("Number of unique companies:", unique_companies, "\n")

## Number of unique companies: 63

cat("Number of unique sectors:", unique_sectors, "\n")

## Number of unique sectors: 7

Observations and Variables

observations <- nrow(data)
variables <- ncol(data)
paste("Number of observations:", observations)

## [1] "Number of observations: 1000"

paste("Number of variables:", variables)

## [1] "Number of variables: 14"

Date Range

data$Date <- as.Date(data$Date, format = "%d-%m-%Y")
range(data$Date)

## [1] "2022-01-01" "2024-09-26"

Missing Values

colSums(is.na(data))

##            Date         Company          Sector            Open            High 
##               0               0               0               0               0 
##             Low           Close          Volume      Market_Cap        PE_Ratio 
##               0               0               0               2               0 
##  Dividend_Yield      Volatility Sentiment_Score           Trend 
##               0               0               0               0

Level 2 - Intermediate Analysis

Records from 2023

data_2023 <- subset(data, format(Date, "%Y") == "2023")
head(data_2023)

## # A tibble: 6 × 14
##   Date       Company   Sector  Open  High   Low Close Volume Market_Cap PE_Ratio
##   <date>     <chr>     <chr>  <dbl> <dbl> <dbl> <dbl>  <dbl>      <dbl>    <dbl>
## 1 2023-01-01 Meta      Finan…  108.  111.  105.  108. 116617    2.03e 9     9.46
## 2 2023-01-02 Roche     Techn…  111.  112.  108.  111.  66896    1.60e11    29.8 
## 3 2023-01-03 Apple     Consu…  111.  113.  109.  111. 180798    4.69e11    39.0 
## 4 2023-01-04 Morgan S… Autom…  113.  115.  111.  113. 188877    9.44e11    33.9 
## 5 2023-01-05 ExxonMob… Techn…  114.  116.  111.  114.  55237    1.03e11    29.5 
## 6 2023-01-06 Roche     Consu…  114.  116.  110.  114.  70056    8.54e11    34.4 
## # ℹ 4 more variables: Dividend_Yield <dbl>, Volatility <dbl>,
## #   Sentiment_Score <dbl>, Trend <chr>

Highest Closing Price

highest_close <- max(data$Close, na.rm = TRUE)
row <- data[data$Close == highest_close, ]
highest_close

## [1] 128.9932

row$Company

## [1] "Chevron"

row$Date

## [1] "2024-08-08"

Close > Open

close_higher_than_open <- subset(data, Close > Open)
head(close_higher_than_open)

## # A tibble: 0 × 14
## # ℹ 14 variables: Date <date>, Company <chr>, Sector <chr>, Open <dbl>,
## #   High <dbl>, Low <dbl>, Close <dbl>, Volume <dbl>, Market_Cap <dbl>,
## #   PE_Ratio <dbl>, Dividend_Yield <dbl>, Volatility <dbl>,
## #   Sentiment_Score <dbl>, Trend <chr>

Highest Trading Volume

max_row <- data[which.max(data$Volume), ]
cat("Highest trading volume:", max_row$Volume, "\n")

## Highest trading volume: 199849

cat("Date:", max_row$Date, "\n")

## Date: 19702

cat("Company:", max_row$Company, "\n")

## Company: Gilead Sciences

Level 3 - Aggregated Analysis

Sector with Highest Avg Close

avg_close <- aggregate(Close ~ Sector, data, mean)
avg_close[which.max(avg_close$Close), ]

##    Sector    Close
## 5 Finance 110.7605

Company with Highest Volume

total_volume <- aggregate(Volume ~ Company, data, sum)
total_volume[which.max(total_volume$Volume), ]

##    Company  Volume
## 35 Moderna 3256043

Sector with Highest Avg Dividend Yield

avg_dividend <- aggregate(Dividend_Yield ~ Sector, data, mean)
avg_dividend[which.max(avg_dividend$Dividend_Yield), ]

##      Sector Dividend_Yield
## 1 Aerospace       2.691776

Monthly Avg Close

data$Month <- format(data$Date, "%Y-%m")
avg_close_month <- aggregate(Close ~ Month, data, mean)
head(avg_close_month)

##     Month    Close
## 1 2022-01 99.43570
## 2 2022-02 95.84786
## 3 2022-03 95.07988
## 4 2022-04 93.24878
## 5 2022-05 95.51409
## 6 2022-06 95.21263

Level 4 - Rankings

PE Ratio Ranking

avg_pe <- aggregate(PE_Ratio ~ Company, data, mean)
ranked_pe <- avg_pe[order(-avg_pe$PE_Ratio), ]
head(ranked_pe)

##        Company PE_Ratio
## 63 Wells Fargo 28.55813
## 4       Amazon 27.68080
## 3      Alibaba 27.33441
## 45     Porsche 27.29486
## 46    Raytheon 27.26369
## 61        Uber 27.10058

Market Cap Ranking

avg_cap <- aggregate(Market_Cap ~ Company, data, mean, na.rm = TRUE)
ranked_cap <- avg_cap[order(avg_cap$Market_Cap), ]
head(ranked_cap)

##       Company   Market_Cap
## 15    Ferrari 309180398477
## 62       Visa 325109600294
## 49    Samsung 365106595885
## 6       Apple 389545920851
## 33       Meta 399994003159
## 14 ExxonMobil 410736615385

Top 5 Bullish Sectors

bullish_data <- subset(data, Trend == "Bullish")
bullish_count <- sort(table(bullish_data$Sector), decreasing = TRUE)
head(bullish_count, 5)

## 
##     Healthcare      Aerospace Consumer Goods        Finance         Energy 
##             53             49             49             49             47

Level 5 - Derived Columns

Valuation Based on PE Ratio

data$Valuation <- ifelse(data$PE_Ratio < 15, "Undervalued",
                         ifelse(data$PE_Ratio <= 25, "Fair", "Overvalued"))
head(data[, c("Company", "PE_Ratio", "Valuation")])

## # A tibble: 6 × 3
##   Company     PE_Ratio Valuation  
##   <chr>          <dbl> <chr>      
## 1 Uber            24.3 Fair       
## 2 Tesla           18.6 Fair       
## 3 Panasonic       10.7 Undervalued
## 4 Tencent         14.6 Undervalued
## 5 Wells Fargo     37.5 Overvalued 
## 6 Snapchat        28.7 Overvalued

Dividend Safety

data$Dividend_Safety <- ifelse(data$Dividend_Yield < 4, "Safe", "Risky")
head(data[, c("Company", "Dividend_Yield", "Dividend_Safety")])

## # A tibble: 6 × 3
##   Company     Dividend_Yield Dividend_Safety
##   <chr>                <dbl> <chr>          
## 1 Uber                 0.163 Safe           
## 2 Tesla                0.289 Safe           
## 3 Panasonic            2.22  Safe           
## 4 Tencent              1.38  Safe           
## 5 Wells Fargo          3.11  Safe           
## 6 Snapchat             4.96  Risky

Data Visualizations

#This analysis explores stock market data, examining relationships between opening and closing prices, market capitalization, trading volume, and sector performance. The analysis includes visualizations, regression models, and statistical tests to uncover patterns in stock behavior.

Companies by Sector

#Bar plot showing distribution of companies across sectors # This visualization shows which sectors have the most companies represented in the dataset. Financial or Technology sectors often dominate such analyses.

ggplot(data, aes(x = Sector, fill = Sector)) +
  geom_bar() +
  labs(title = "Number of Companies by Sector", x = "Sector", y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Simple Linear Regression: Close ~ Open

#Build linear model predicting Close price from Open price #This shows the strong relationship between opening and closing prices. The high R-squared value would indicate that opening price is a good predictor of closing price.

model_slr <- lm(Close ~ Open, data = data[!is.na(data$Market_Cap), ])
summary(model_slr)

## 
## Call:
## lm(formula = Close ~ Open, data = data[!is.na(data$Market_Cap), 
##     ])
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.246e-14 -1.260e-15 -8.500e-16 -4.000e-16  9.843e-13 
## 
## Coefficients:
##              Estimate Std. Error   t value Pr(>|t|)    
## (Intercept) 2.936e-13  9.595e-15 3.059e+01   <2e-16 ***
## Open        1.000e+00  8.663e-17 1.154e+16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.124e-14 on 996 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 1.332e+32 on 1 and 996 DF,  p-value: < 2.2e-16

ggplot(data, aes(x = Open, y = Close)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "SLR: Close vs Open", x = "Open Price", y = "Close Price")

Polynomial Regression: Close ~ Market Cap

#Build polynomial regression model (degree 2) for Close price vs Market Cap #This examines whether larger companies (by market cap) have different closing price patterns. The polynomial term helps capture any non-linear relationships.

model_poly <- lm(Close ~ poly(as.numeric(Market_Cap), 2), data = data[!is.na(data$Market_Cap), ])
summary(model_poly)

## 
## Call:
## lm(formula = Close ~ poly(as.numeric(Market_Cap), 2), data = data[!is.na(data$Market_Cap), 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.968 -10.087   3.540   8.621  19.939 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      110.1701     0.3615 304.784   <2e-16 ***
## poly(as.numeric(Market_Cap), 2)1 -15.8150    11.4192  -1.385    0.166    
## poly(as.numeric(Market_Cap), 2)2  -3.9626    11.4192  -0.347    0.729    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.42 on 995 degrees of freedom
## Multiple R-squared:  0.002045,   Adjusted R-squared:  3.86e-05 
## F-statistic: 1.019 on 2 and 995 DF,  p-value: 0.3612

ggplot(data, aes(x = Market_Cap, y = Close)) +
  geom_point(alpha = 0.5) +
  stat_smooth(method = "lm", formula = y ~ poly(x, 2), color = "blue") +
  labs(title = "Polynomial Regression", x = "Market Cap", y = "Close")

Histogram of Closing Prices

Distribution of closing prices across all stocks

#Shows the frequency distribution of stock prices. Most stocks likely cluster at lower price points with a long right tail of higher-priced stocks.

ggplot(data, aes(x = Close)) +
  geom_histogram(bins = 30, fill = "blue", alpha = 0.7) +
  labs(title = "Distribution of Closing Prices", x = "Closing Price", y = "Frequency")

Correlation: Open vs Close

Calculate correlation coefficient between Open and Close prices

#Quantifies the strength of relationship between opening and closing prices. A correlation near 1 would indicate very strong linear relationship.

cor_open_close <- cor(data$Open, data$Close, use = "complete.obs")
cat("Correlation between Open and Close:", cor_open_close, "\n")

## Correlation between Open and Close: 1

ggplot(data, aes(x = Open, y = Close)) +
  geom_point(color = "steelblue", alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "darkred") +
  labs(title = paste("Open vs Close (Correlation =", round(cor_open_close, 2), ")"),
       x = "Open", y = "Close")

Boxplot: Close by Sector

Compare closing price distributions across sectors

#Reveals which sectors tend to have higher/lower priced stocks and the variability within each sector. Technology and healthcare often show wider ranges.

ggplot(data, aes(x = Sector, y = Close, fill = Sector)) +
  geom_boxplot() +
  labs(title = "Close Price by Sector", x = "Sector", y = "Close") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Monthly Avg Volume

#Calculate and visualize average trading volume by month #Shows seasonal patterns in trading activity. Typically higher volumes are seen at quarter ends or during earnings seasons.

monthly_volume <- data %>%
  group_by(Month) %>%
  summarize(Avg_Volume = mean(Volume, na.rm = TRUE))

ggplot(monthly_volume, aes(x = Month, y = Avg_Volume, group = 1)) +
  geom_line(color = "purple") +
  geom_point(color = "purple") +
  labs(title = "Average Trading Volume by Month", x = "Month", y = "Avg Volume") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Advanced Analysis

ANOVA: Close Price by Sector

#Analysis of Variance to test if sector affects closing price #Determines whether sector membership significantly affects closing prices. A significant p-value would suggest sector does matter.

anova_model <- aov(Close ~ Sector, data = data[!is.na(data$Market_Cap), ])
summary(anova_model)

##              Df Sum Sq Mean Sq F value Pr(>F)
## Sector        6    395   65.91   0.504  0.806
## Residuals   991 129617  130.79

Stock Data Analysis with R

Vishal Shivnani, Aditya Raj Chaudhary

2025-05-01