# Read the merged CSV into a data frame, then list its column names.
data <- read.csv(file = "D:/2024-2025/FA24/Stats/mergedfile.csv")
names(data)
##  [1] "Date"                "Symbol"              "Adj.Close"          
##  [4] "Close"               "High"                "Low"                
##  [7] "Open"                "Volume"              "Exchange"           
## [10] "Shortname"           "Longname"            "Sector"             
## [13] "Industry"            "Currentprice"        "Marketcap"          
## [16] "Ebitda"              "Revenuegrowth"       "City"               
## [19] "State"               "Country"             "Fulltimeemployees"  
## [22] "Longbusinesssummary" "Weight"

Question 1

What are the three unclear columns/values in your data?

Question 2

What is still unclear even after reading the documentation?

One main issue that I could not resolve, even after a deeper dive into the documentation, is the “Revenuegrowth” column. The problem is that the column has no frame of reference: over what time period is the growth measured? It could be reported twice a year, every quarter, once a year, and so on.

Plots

# Load necessary libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Average revenue growth for five selected companies.
# Rows are restricted to the chosen ticker symbols, grouped by Symbol and
# Sector, and Revenuegrowth is averaged within each group (NAs are ignored).
# `.groups = "drop"` returns an ungrouped table; without it the result stays
# grouped by Symbol and dplyr prints a "grouped output" message.
sample_data <- data |>
  filter(Symbol %in% c("AAPL", "CAT", "CDW", "BLK", "GOOGL")) |>
  group_by(Symbol, Sector) |>
  summarize(
    AverageRevenueGrowth = mean(Revenuegrowth, na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'Symbol'. You can override using the
## `.groups` argument.
# Bar chart: one bar per company showing its average revenue growth,
# with the fill colour taken from the company's sector.
sample_data |>
  ggplot(aes(x = Symbol, y = AverageRevenueGrowth, fill = Sector)) +
  # geom_col() draws bar heights directly from the y values
  geom_col(width = 0.6) +
  # Minimal theme with a centred, bold title
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  # Title and axis labels
  labs(
    title = "Revenue Growth with Ambiguous Timeframe",
    x = "Company",
    y = "Revenue Growth (%)"
  )

# Note: averaging (or, alternatively, selecting a single representative value) avoids summing values that are repeated across a company's rows.

To highlight the issue with the unclear time frame of the “Revenuegrowth” column, the visualization above compares the average revenue growth of several companies from different sectors. The goal is to show that, without knowing whether the growth rate is quarterly, yearly, or measured over some other period, it is difficult to draw meaningful conclusions.