library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readxl)
data <- read.csv("C:/Users/aiden/mergedfile.csv")
summary(data)
## Date Symbol Adj.Close Close
## Length:352097 Length:352097 Min. : 1.03 Min. : 1.03
## Class :character Class :character 1st Qu.: 28.07 1st Qu.: 33.74
## Mode :character Mode :character Median : 52.26 Median : 60.79
## Mean : 105.71 Mean : 113.01
## 3rd Qu.: 102.52 3rd Qu.: 113.34
## Max. :4119.09 Max. :4119.09
## NA's :12562 NA's :12562
## High Low Open Volume
## Min. : 1.26 Min. : 1.01 Min. : 1.03 Min. :0.000e+00
## 1st Qu.: 34.10 1st Qu.: 33.34 1st Qu.: 33.72 1st Qu.:9.424e+05
## Median : 61.41 Median : 60.14 Median : 60.75 Median :2.131e+06
## Mean : 114.22 Mean : 111.75 Mean : 113.00 Mean :9.998e+06
## 3rd Qu.: 114.50 3rd Qu.: 112.10 3rd Qu.: 113.31 3rd Qu.:4.965e+06
## Max. :4144.32 Max. :4110.64 Max. :4117.00 Max. :1.881e+09
## NA's :12562 NA's :12562 NA's :12562 NA's :12562
## Exchange Shortname Longname Sector
## Length:352097 Length:352097 Length:352097 Length:352097
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Industry Currentprice Marketcap Ebitda
## Length:352097 Min. : 10.39 Min. :6.823e+09 Min. :-8.410e+08
## Class :character 1st Qu.: 72.29 1st Qu.:1.748e+10 1st Qu.: 1.568e+09
## Mode :character Median : 130.55 Median :4.050e+10 Median : 2.977e+09
## Mean : 232.74 Mean :1.769e+11 Mean : 1.078e+10
## 3rd Qu.: 227.00 3rd Qu.:1.099e+11 3rd Qu.: 6.499e+09
## Max. :3830.58 Max. :3.449e+12 Max. : 1.318e+11
## NA's :22110
## Revenuegrowth City State Country
## Min. :-0.39700 Length:352097 Length:352097 Length:352097
## 1st Qu.:-0.01600 Class :character Class :character Class :character
## Median : 0.04700 Mode :character Mode :character Mode :character
## Mean : 0.04264
## 3rd Qu.: 0.09600
## Max. : 0.46300
##
## Fulltimeemployees Longbusinesssummary Weight
## Min. : 568 Length:352097 Min. :0.0001304
## 1st Qu.: 9372 Class :character 1st Qu.:0.0003340
## Median : 24150 Mode :character Median :0.0007741
## Mean : 68873 Mean :0.0033815
## 3rd Qu.: 57000 3rd Qu.:0.0021012
## Max. :1525000 Max. :0.0659147
##
For categorical columns, this should include unique values and counts
For numeric columns, this includes min/max, central tendency, and some notion of distribution (e.g., quantiles)
These summaries can be combined
# Create percentage price change column if not already present
data <- data |>
mutate(perc_change = (Close - Open) / Open * 100) # Calculate percentage price change
# Create a summary of volatility and market capitalization by sector
sector_volatility_summary <- data |>
group_by(Sector) |>
summarise(
Avg_Perc_Change_SD = sd(perc_change, na.rm = TRUE), # Stock price volatility (standard deviation)
Avg_Market_Cap = mean(Marketcap, na.rm = TRUE), # Average market capitalization
Stock_Count = n_distinct(Symbol) # Number of unique stocks in the sector
) |>
arrange(desc(Avg_Perc_Change_SD)) # Order by highest volatility
# Display the summary
sector_volatility_summary |> print()
## # A tibble: 11 × 4
## Sector Avg_Perc_Change_SD Avg_Market_Cap Stock_Count
## <chr> <dbl> <dbl> <int>
## 1 Energy 2.27 2.29e10 2
## 2 Industrials 1.92 4.94e10 11
## 3 Consumer Cyclical 1.85 1.62e11 14
## 4 Basic Materials 1.67 3.60e10 2
## 5 Technology 1.66 3.68e11 16
## 6 Healthcare 1.51 8.35e10 14
## 7 Real Estate 1.45 3.68e10 6
## 8 Financial Services 1.40 1.36e11 17
## 9 Consumer Defensive 1.29 3.38e10 5
## 10 Utilities 1.26 2.49e10 6
## 11 Communication Services 1.21 1.41e12 3
Column summaries (i.e., the above bullet)
Data documentation
Your project’s goals/purpose
Answering the column summaries question, in this function I am creating a tibble that displays average volume and 5 year percent change based off of sector
# Assuming merged_file is your data frame and it's loaded correctly
sector_performance <- data |>
mutate(perc_change = (Close - Open) / Open * 100) |>
group_by(Sector) |>
summarise(
avg_volume = mean(Volume, na.rm = TRUE),
avg_perc_change = mean(perc_change, na.rm = TRUE)
)
# Display the summary
print(sector_performance)
## # A tibble: 11 × 3
## Sector avg_volume avg_perc_change
## <chr> <dbl> <dbl>
## 1 Basic Materials 1325863. 0.0172
## 2 Communication Services 50353842. 0.0155
## 3 Consumer Cyclical 9948954. 0.0251
## 4 Consumer Defensive 3584157. 0.0203
## 5 Energy 5552398. -0.0182
## 6 Financial Services 8521951. 0.0280
## 7 Healthcare 4158931. 0.0323
## 8 Industrials 4317506. 0.0201
## 9 Real Estate 1318084. 0.0214
## 10 Technology 23137866. 0.0561
## 11 Utilities 2253649. 0.0266
This should include distributions at least
In addition, you should consider trends, correlations, and interactions between variables
Use different channels (e.g., color) to show how categorical variables interact with continuous variables
# Filter for selected companies
selected_symbols <- c("AAPL", "GOOGL", "MMM") # Adjust this list as needed
filtered_data <- data |>
filter(Symbol %in% selected_symbols)
# Convert Date column to Date type if not already
filtered_data <- filtered_data |>
mutate(Date = as.Date(Date, format = "%Y-%m-%d"))
# Line plot of closing prices over time
ggplot(filtered_data, aes(x = Date, y = Close, color = Symbol, group = Symbol)) +
geom_line() +
labs(title = "Stock Price Trends Over Time for Selected Companies",
x = "Date",
y = "Closing Price") +
theme_minimal() +
theme(legend.position = "bottom")
# Calculate percentage change
merged_file <- data |>
mutate(perc_change = (Close - Open) / Open * 100)
# Scatter plot of trading volume vs percentage change, colored by sector
ggplot(data, aes(x = Volume, y = perc_change, color = Sector)) +
geom_point(alpha = 0.6) +
scale_x_log10() + # Log scale for volume to handle wide range
labs(title = "Interaction between Trading Volume and Percentage Price Change by Sector",
x = "Trading Volume (log scale)",
y = "Percentage Price Change") +
theme_minimal() +
theme(legend.position = "bottom")
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 12562 rows containing missing values or values outside the scale range
## (`geom_point()`).