# Absolute location of the project CSV (forward slashes are valid on Windows)
file_path <- "C:/Users/Gero9877/OneDrive - University of St. Thomas/OPMT 470/Final Project/Data_Without_Top_100_Customers.csv"

# Read the CSV when it is present; otherwise abort before anything else runs
if (file.exists(file_path)) {
  Data_Without_Top_100_Customers <- read.csv(file = file_path)
} else {
  stop("File not found at the specified path.")
}

# Attach ggplot2 for the charts further down
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
# Show the first six rows as a quick sanity check of the import
head(Data_Without_Top_100_Customers, n = 6L)

# Load the Data

# Corrected file path with forward slashes
file_path <- "C:/Users/Gero9877/OneDrive - University of St. Thomas/OPMT 470/Final Project/Data_Without_Top_100_Customers.csv"

# Check that the file exists before reading it, so a missing or moved CSV
# stops the script with a clear message
if (!file.exists(file_path)) {
  stop("File not found at the specified path.")
}

# Load the CSV file into a data frame
Data_Without_Top_100_Customers <- read.csv(file_path)

# Load ggplot2 library (used by the plots that follow)
library(ggplot2)

# Preview the first rows of the data
head(Data_Without_Top_100_Customers)

# Distribution of spending by day of the week

# First plot: Price Distribution by Day of the Week
# Each bar's height is the total of `price` for that weekday. The totals are
# computed up front with aggregate() so ggplot draws one bar per weekday;
# plotting the raw rows with stat = "identity" stacks one black-outlined
# rectangle per row, which renders as a solid black bar and is very slow.
# The explicit factor levels put the weekdays in calendar order; without them
# a text weekday column is sorted alphabetically on the axis.
ggplot(
  aggregate(price ~ weekday, data = Data_Without_Top_100_Customers, FUN = sum),
  aes(x = factor(weekday,
                 levels = c("Sunday", "Monday", "Tuesday", "Wednesday",
                            "Thursday", "Friday", "Saturday")),
      y = price)
) +
  geom_col(fill = "skyblue", color = "black") +
  labs(title = "Price Distribution by Day of the Week",
       x = "Weekday",
       y = "Price")

# Distribution by the three different buckets (event types)

# Second plot: Event Type Distribution by Day of the Week
# Stacked bar chart: one bar per weekday, with the row count split by
# event_type. The explicit factor levels keep the weekdays in calendar order;
# unless weekday is already a factor, ggplot would sort them alphabetically
# (Friday, Monday, Saturday, ...).
ggplot(Data_Without_Top_100_Customers,
       aes(x = factor(weekday,
                      levels = c("Sunday", "Monday", "Tuesday", "Wednesday",
                                 "Thursday", "Friday", "Saturday")),
           fill = event_type)) +
  geom_bar(position = "stack") +
  labs(title = "Event Type Distribution by Day of the Week",
       x = "Weekday",
       y = "Count of Events",
       fill = "Event Type") +
  theme_minimal()

# Linear Regression

# Linear regression model



# Columns used by the model: price is the response, the three category
# columns are the predictors
selected_vars <- c("price", "category_1", "category_2", "category_3")

# Keep only the modelling columns
Data_Without_Top_100_Customers_subset <-
  Data_Without_Top_100_Customers[, selected_vars, drop = FALSE]

# Regress price on every other column of the subset and print the fit summary
lr_data <- lm(formula = price ~ ., data = Data_Without_Top_100_Customers_subset)
summary(lr_data)
## 
## Call:
## lm(formula = price ~ ., data = Data_Without_Top_100_Customers_subset)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##   -736    -86    -20     30  64396 
## 
## Coefficients: (17 not defined because of singularities)
##                           Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                 25.695     14.186    1.811 0.070088 .  
## category_1auto              93.278     59.988    1.555 0.119961    
## category_1computers         84.887     14.727    5.764 8.22e-09 ***
## category_1construction     118.661     21.624    5.487 4.08e-08 ***
## category_1electronics       -3.597     17.488   -0.206 0.837017    
## category_1furniture       -645.289    109.250   -5.907 3.50e-09 ***
## category_1medicine          13.809     18.901    0.731 0.465022    
## category_2audio             92.818     10.428    8.901  < 2e-16 ***
## category_2bathroom         705.414    124.510    5.665 1.47e-08 ***
## category_2bedroom          708.926    128.098    5.534 3.13e-08 ***
## category_2camera           302.284     13.187   22.923  < 2e-16 ***
## category_2components       275.399      4.076   67.572  < 2e-16 ***
## category_2environment      134.557     14.609    9.210  < 2e-16 ***
## category_2kitchen          640.975     45.305   14.148  < 2e-16 ***
## category_2living_room      639.072    110.012    5.809 6.28e-09 ***
## category_2network          164.637      9.032   18.228  < 2e-16 ***
## category_2peripherals           NA         NA       NA       NA    
## category_2personal              NA         NA       NA       NA    
## category_2tools                 NA         NA       NA       NA    
## category_2video                 NA         NA       NA       NA    
## category_3air_heater      -100.598     11.717   -8.586  < 2e-16 ***
## category_3alarm            -37.905     58.594   -0.647 0.517685    
## category_3bath                  NA         NA       NA       NA    
## category_3blanket               NA         NA       NA       NA    
## category_3blender         -579.185     44.190  -13.107  < 2e-16 ***
## category_3camera           -58.854      5.996   -9.815  < 2e-16 ***
## category_3cdrw            -306.248     11.446  -26.755  < 2e-16 ***
## category_3chair             53.597     51.759    1.036 0.300425    
## category_3coffee_grinder  -607.575     48.565  -12.511  < 2e-16 ***
## category_3coffee_machine  -335.425     43.399   -7.729 1.09e-14 ***
## category_3compressor       -17.115     58.662   -0.292 0.770473    
## category_3cooler          -359.450      3.480 -103.284  < 2e-16 ***
## category_3cpu             -194.444      2.333  -83.362  < 2e-16 ***
## category_3dictaphone       -49.262     19.382   -2.542 0.011034 *  
## category_3drill            -43.159     16.700   -2.584 0.009756 ** 
## category_3fan             -131.401     37.660   -3.489 0.000485 ***
## category_3fryer           -603.810     46.786  -12.906  < 2e-16 ***
## category_3generator         50.375     29.497    1.708 0.087671 .  
## category_3gps              -40.882     58.727   -0.696 0.486347    
## category_3grill           -590.532     43.455  -13.589  < 2e-16 ***
## category_3hair_cutter       26.141     16.755    1.560 0.118726    
## category_3hdd             -274.606      3.679  -74.637  < 2e-16 ***
## category_3headphone        -56.335      4.377  -12.870  < 2e-16 ***
## category_3heater           -76.058     34.985   -2.174 0.029706 *  
## category_3immobilizer      -96.295     68.700   -1.402 0.161014    
## category_3joystick          -9.333      5.691   -1.640 0.101032    
## category_3juicer          -587.769     43.707  -13.448  < 2e-16 ***
## category_3kettle          -614.726     43.587  -14.103  < 2e-16 ***
## category_3keyboard         -33.494      7.683   -4.359 1.30e-05 ***
## category_3light           -102.844     17.658   -5.824 5.74e-09 ***
## category_3massager          31.822     17.947    1.773 0.076212 .  
## category_3meat_grinder    -521.713     43.544  -11.981  < 2e-16 ***
## category_3microphone       -74.446      7.132  -10.438  < 2e-16 ***
## category_3microwave       -507.673     43.489  -11.674  < 2e-16 ***
## category_3mixer           -546.208     43.632  -12.518  < 2e-16 ***
## category_3monitor          264.850      5.681   46.623  < 2e-16 ***
## category_3motherboard     -288.363      2.265 -127.305  < 2e-16 ***
## category_3mouse            -69.817      6.468  -10.794  < 2e-16 ***
## category_3music_tools      322.493     16.526   19.514  < 2e-16 ***
## category_3nas              142.213     21.283    6.682 2.36e-11 ***
## category_3network_adapter -345.520      8.753  -39.473  < 2e-16 ***
## category_3painting         -96.124     22.014   -4.367 1.26e-05 ***
## category_3parktronic       -70.091     58.832   -1.191 0.233507    
## category_3photo           -299.473     23.411  -12.792  < 2e-16 ***
## category_3player            19.864     58.341    0.340 0.733500    
## category_3power_supply    -282.979      3.582  -79.006  < 2e-16 ***
## category_3printer           60.925      4.270   14.269  < 2e-16 ***
## category_3projector        724.593     13.215   54.832  < 2e-16 ***
## category_3pump             -50.412     20.759   -2.428 0.015165 *  
## category_3radar            -12.267     59.818   -0.205 0.837517    
## category_3refrigerators   -611.263     96.211   -6.353 2.11e-10 ***
## category_3router                NA         NA       NA       NA    
## category_3saw               49.828     17.232    2.892 0.003832 ** 
## category_3scales                NA         NA       NA       NA    
## category_3scanner          106.830      8.498   12.571  < 2e-16 ***
## category_3screw           -126.065     17.871   -7.054 1.74e-12 ***
## category_3shelving              NA         NA       NA       NA    
## category_3soldering        -70.023     16.701   -4.193 2.76e-05 ***
## category_3sound_card      -290.020      8.678  -33.421  < 2e-16 ***
## category_3steam_cooker    -614.139     54.633  -11.241  < 2e-16 ***
## category_3tonometer             NA         NA       NA       NA    
## category_3toster          -614.684     48.840  -12.586  < 2e-16 ***
## category_3tv               331.961     10.478   31.681  < 2e-16 ***
## category_3tv_remote             NA         NA       NA       NA    
## category_3tv_tuner        -345.003      7.218  -47.800  < 2e-16 ***
## category_3vacuum                NA         NA       NA       NA    
## category_3video                 NA         NA       NA       NA    
## category_3videocards            NA         NA       NA       NA    
## category_3videoregister    -10.030     58.415   -0.172 0.863675    
## category_3vr               -74.122     24.133   -3.071 0.002131 ** 
## category_3washer                NA         NA       NA       NA    
## category_3welding           27.336     17.343    1.576 0.114985    
## category_3wifi                  NA         NA       NA       NA    
## category_3winch            -87.435    148.022   -0.591 0.554732    
## category_3window           -58.413     61.925   -0.943 0.345532    
## category_3wrench                NA         NA       NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 333.3 on 448575 degrees of freedom
##   (435810 observations deleted due to missingness)
## Multiple R-squared:  0.1404, Adjusted R-squared:  0.1402 
## F-statistic: 938.9 on 78 and 448575 DF,  p-value: < 2.2e-16

# This shows the distribution of activity by hour and by day

# Print numbers in fixed notation rather than scientific notation
options(scipen = 999)

# ggplot2 is needed for the histogram below
library(ggplot2)

# Turn weekday into an ordered factor running Sunday through Saturday
Data_Without_Top_100_Customers$weekday <- ordered(
  Data_Without_Top_100_Customers$weekday,
  levels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
)

# Keep every row whose event type is neither "view" nor "cart"
filtered_data <- Data_Without_Top_100_Customers[
  !is.element(Data_Without_Top_100_Customers[["event_type"]], c("view", "cart")),
]

# Coerce the hour column to numeric so it can be binned
filtered_data[["time_hour"]] <- as.numeric(filtered_data[["time_hour"]])

# Stop if the filter removed every row, since there would be nothing to plot
if (nrow(filtered_data) < 1) {
  stop("Filtered dataset has no rows after filtering event types")
}

# Histogram of event hour in one-hour bins, one panel per weekday,
# with event types drawn side by side
ggplot(data = filtered_data, mapping = aes(x = time_hour, fill = event_type)) +
  geom_histogram(binwidth = 1, position = "dodge") +
  facet_wrap(vars(weekday), ncol = 3) +
  ggtitle("Distribution of Events by Hour, Weekday, and Event Type (Excluding View and Cart)") +
  xlab("Hour") +
  ylab("Count") +
  labs(fill = "Event Type") +
  theme_minimal()

# Clean the data by removing unnecessary columns, selected by position.
# drop = FALSE keeps the result a data frame even if only one column remains.
# NOTE(review): positional indices silently select the wrong columns if the
# CSV layout changes; consider selecting the columns to keep by name.
data_cleaned <- Data_Without_Top_100_Customers[
  , -c(1, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 17),
  drop = FALSE
]

# Preview the cleaned data (head must be called on the data frame; a bare
# `head` only prints the function definition)
head(data_cleaned)

# Mean of price by day

# Average price within each weekday group
tapply(
  X = Data_Without_Top_100_Customers[["price"]],
  INDEX = Data_Without_Top_100_Customers[["weekday"]],
  FUN = mean
)
##    Sunday    Monday   Tuesday Wednesday  Thursday    Friday  Saturday 
##  147.7764  143.7999  148.8428  144.3208  147.6150  147.5641  144.4111

# Mean of price by activity (event type) and weekday

# Average price for every event type (rows) by weekday (columns) combination
tapply(
  X = Data_Without_Top_100_Customers[["price"]],
  INDEX = list(
    Data_Without_Top_100_Customers[["event_type"]],
    Data_Without_Top_100_Customers[["weekday"]]
  ),
  FUN = mean
)
##            Sunday   Monday  Tuesday Wednesday Thursday   Friday Saturday
## cart     164.1900 154.2199 162.7951  152.3394 163.5810 160.6858 160.2764
## purchase 141.9904 135.2100 136.7100  129.9932 138.6087 140.4071 138.6811
## view     146.9437 143.4981 148.4461  144.4641 146.9411 147.0144 143.6201

# Boxplot of price by weekday

# Create a boxplot of price per weekday and capture its statistics in 'bp'.
# bp$stats has one column per box; its rows are, in order: lower whisker,
# lower hinge, median, upper hinge, upper whisker.
bp <- boxplot(Data_Without_Top_100_Customers$price ~ Data_Without_Top_100_Customers$weekday,
              xlab = NA,                # No x-axis label
              ylab = "Price",           # Y-axis label
              main = "Boxplot of Price by Weekday", # Title
              outline = FALSE)          # Do not draw outlier points

# Label each box with its median (row 3 of bp$stats is the median, not the
# mean and not the 3rd quartile)
text(x = seq_len(ncol(bp$stats)),       # One x position per box
     y = bp$stats[3, ],                 # Height of the median line
     labels = round(bp$stats[3, ], 2),  # Median rounded to 2 decimals
     pos = 3,                           # Place the text just above the median line
     cex = 0.7,                         # Text size
     col = "blue")                      # Text color

# This plot shows the frequency of events by day

# ggplot2 is used by the chart that follows the base-R barplot
library(ggplot2)

# Grouped base-R bar chart: counts of each event type (one bar per type)
# within every weekday, with a legend built from the table's row names
barplot(
  height = table(
    Data_Without_Top_100_Customers[["event_type"]],
    Data_Without_Top_100_Customers[["weekday"]]
  ),
  main = "Distribution of Event Types Across Weekdays",
  xlab = "Weekday",
  ylab = "Frequency",
  beside = TRUE,
  legend.text = TRUE
)

# Stacked ggplot2 bar chart: one bar per weekday, split by event type
ggplot(
  data = Data_Without_Top_100_Customers,
  mapping = aes(x = weekday, fill = event_type)
) +
  geom_bar(position = "stack") +
  ggtitle("Event Type Distribution by Day of the Week") +
  xlab("Weekday") +
  ylab("Count of Events") +
  labs(fill = "Event Type") +
  theme_minimal()