library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readxl)
data <- read.csv("C:/Users/aiden/mergedfile.csv")
summary(data)
##      Date              Symbol            Adj.Close           Close        
##  Length:352097      Length:352097      Min.   :   1.03   Min.   :   1.03  
##  Class :character   Class :character   1st Qu.:  28.07   1st Qu.:  33.74  
##  Mode  :character   Mode  :character   Median :  52.26   Median :  60.79  
##                                        Mean   : 105.71   Mean   : 113.01  
##                                        3rd Qu.: 102.52   3rd Qu.: 113.34  
##                                        Max.   :4119.09   Max.   :4119.09  
##                                        NA's   :12562     NA's   :12562    
##       High              Low               Open             Volume         
##  Min.   :   1.26   Min.   :   1.01   Min.   :   1.03   Min.   :0.000e+00  
##  1st Qu.:  34.10   1st Qu.:  33.34   1st Qu.:  33.72   1st Qu.:9.424e+05  
##  Median :  61.41   Median :  60.14   Median :  60.75   Median :2.131e+06  
##  Mean   : 114.22   Mean   : 111.75   Mean   : 113.00   Mean   :9.998e+06  
##  3rd Qu.: 114.50   3rd Qu.: 112.10   3rd Qu.: 113.31   3rd Qu.:4.965e+06  
##  Max.   :4144.32   Max.   :4110.64   Max.   :4117.00   Max.   :1.881e+09  
##  NA's   :12562     NA's   :12562     NA's   :12562     NA's   :12562      
##    Exchange          Shortname           Longname            Sector         
##  Length:352097      Length:352097      Length:352097      Length:352097     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    Industry          Currentprice       Marketcap             Ebitda          
##  Length:352097      Min.   :  10.39   Min.   :6.823e+09   Min.   :-8.410e+08  
##  Class :character   1st Qu.:  72.29   1st Qu.:1.748e+10   1st Qu.: 1.568e+09  
##  Mode  :character   Median : 130.55   Median :4.050e+10   Median : 2.977e+09  
##                     Mean   : 232.74   Mean   :1.769e+11   Mean   : 1.078e+10  
##                     3rd Qu.: 227.00   3rd Qu.:1.099e+11   3rd Qu.: 6.499e+09  
##                     Max.   :3830.58   Max.   :3.449e+12   Max.   : 1.318e+11  
##                                                           NA's   :22110       
##  Revenuegrowth          City              State             Country         
##  Min.   :-0.39700   Length:352097      Length:352097      Length:352097     
##  1st Qu.:-0.01600   Class :character   Class :character   Class :character  
##  Median : 0.04700   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 0.04264                                                           
##  3rd Qu.: 0.09600                                                           
##  Max.   : 0.46300                                                           
##                                                                             
##  Fulltimeemployees Longbusinesssummary     Weight         
##  Min.   :    568   Length:352097       Min.   :0.0001304  
##  1st Qu.:   9372   Class :character    1st Qu.:0.0003340  
##  Median :  24150   Mode  :character    Median :0.0007741  
##  Mean   :  68873                       Mean   :0.0033815  
##  3rd Qu.:  57000                       3rd Qu.:0.0021012  
##  Max.   :1525000                       Max.   :0.0659147  
## 

A numeric summary of data for at least 2 columns of data

# Create percentage price change column if not already present
data <- data |>
  mutate(perc_change = (Close - Open) / Open * 100)  # Calculate percentage price change

# Create a summary of volatility and market capitalization by sector
sector_volatility_summary <- data |>
  group_by(Sector) |>
  summarise(
    Avg_Perc_Change_SD = sd(perc_change, na.rm = TRUE),  # Stock price volatility (standard deviation)
    Avg_Market_Cap = mean(Marketcap, na.rm = TRUE),  # Average market capitalization
    Stock_Count = n_distinct(Symbol)  # Number of unique stocks in the sector
  ) |>
  arrange(desc(Avg_Perc_Change_SD))  # Order by highest volatility

# Display the summary
sector_volatility_summary |> print()
## # A tibble: 11 × 4
##    Sector                 Avg_Perc_Change_SD Avg_Market_Cap Stock_Count
##    <chr>                               <dbl>          <dbl>       <int>
##  1 Energy                               2.27        2.29e10           2
##  2 Industrials                          1.92        4.94e10          11
##  3 Consumer Cyclical                    1.85        1.62e11          14
##  4 Basic Materials                      1.67        3.60e10           2
##  5 Technology                           1.66        3.68e11          16
##  6 Healthcare                           1.51        8.35e10          14
##  7 Real Estate                          1.45        3.68e10           6
##  8 Financial Services                   1.40        1.36e11          17
##  9 Consumer Defensive                   1.29        3.38e10           5
## 10 Utilities                            1.26        2.49e10           6
## 11 Communication Services               1.21        1.41e12           3

A set of at least 3 novel questions to investigate informed by the following

Address at least one of the above questions using an aggregation function

Answering the column summaries question, in this function I am creating a tibble that displays average volume and 5 year percent change based off of sector

# Assuming merged_file is your data frame and it's loaded correctly
sector_performance <- data |>
  mutate(perc_change = (Close - Open) / Open * 100) |>
  group_by(Sector) |>
  summarise(
    avg_volume = mean(Volume, na.rm = TRUE),
    avg_perc_change = mean(perc_change, na.rm = TRUE)
  )

# Display the summary
print(sector_performance)
## # A tibble: 11 × 3
##    Sector                 avg_volume avg_perc_change
##    <chr>                       <dbl>           <dbl>
##  1 Basic Materials          1325863.          0.0172
##  2 Communication Services  50353842.          0.0155
##  3 Consumer Cyclical        9948954.          0.0251
##  4 Consumer Defensive       3584157.          0.0203
##  5 Energy                   5552398.         -0.0182
##  6 Financial Services       8521951.          0.0280
##  7 Healthcare               4158931.          0.0323
##  8 Industrials              4317506.          0.0201
##  9 Real Estate              1318084.          0.0214
## 10 Technology              23137866.          0.0561
## 11 Utilities                2253649.          0.0266

Visual summaries (i.e., visualizations) of 2 or more columns of your data

# Filter for selected companies
selected_symbols <- c("AAPL", "GOOGL", "MMM")  # Adjust this list as needed
filtered_data <- data |>
  filter(Symbol %in% selected_symbols)

# Convert Date column to Date type if not already
filtered_data <- filtered_data |>
  mutate(Date = as.Date(Date, format = "%Y-%m-%d"))

# Line plot of closing prices over time
ggplot(filtered_data, aes(x = Date, y = Close, color = Symbol, group = Symbol)) +
  geom_line() +
  labs(title = "Stock Price Trends Over Time for Selected Companies",
       x = "Date",
       y = "Closing Price") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Calculate percentage change
merged_file <- data |> 
  mutate(perc_change = (Close - Open) / Open * 100)

# Scatter plot of trading volume vs percentage change, colored by sector
ggplot(data, aes(x = Volume, y = perc_change, color = Sector)) +
  geom_point(alpha = 0.6) +
  scale_x_log10() +  # Log scale for volume to handle wide range
  labs(title = "Interaction between Trading Volume and Percentage Price Change by Sector",
       x = "Trading Volume (log scale)",
       y = "Percentage Price Change") +
  theme_minimal() +
  theme(legend.position = "bottom")
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 12562 rows containing missing values or values outside the scale range
## (`geom_point()`).