# Load packages ----
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2

# Load the data ----
# Absolute path to the project CSV, written with forward slashes so that no
# backslash escaping is needed on Windows.
file_path <- "C:/Users/Gero9877/OneDrive - University of St. Thomas/OPMT 470/Final Project/Data_Without_Top_100_Customers.csv"

# Fail early with a clear message if the file is missing
if (!file.exists(file_path)) {
  stop("File not found at the specified path.")
}

# Read the CSV once; the resulting data frame is reused by every later step
Data_Without_Top_100_Customers <- read.csv(file_path)

# Preview the first rows of the data
head(Data_Without_Top_100_Customers)
# Spending and activity by day of the week ----

# First plot: total price per weekday.
# The height of each bar is the sum of `price` within that weekday.
# stat_summary() computes the sum so that a single bar is drawn per weekday,
# rather than one outlined rectangle per data row stacked on top of each other.
# NOTE(review): the totals match the previous stacked bars as long as no
# price is negative - TODO confirm.
ggplot(Data_Without_Top_100_Customers, aes(x = weekday, y = price)) +
  stat_summary(fun = sum, geom = "bar", fill = "skyblue", color = "black") +
  labs(
    title = "Price Distribution by Day of the Week",
    x = "Weekday",
    y = "Price"
  )

# Second plot: number of events per weekday, stacked by event type.
# geom_bar() counts the rows in each weekday / event_type combination.
ggplot(Data_Without_Top_100_Customers, aes(x = weekday, fill = event_type)) +
  geom_bar(position = "stack") +
  labs(
    title = "Event Type Distribution by Day of the Week",
    x = "Weekday",
    y = "Count of Events",
    fill = "Event Type"
  ) +
  theme_minimal()
# Linear regression model ----
# Regress price on the three category columns. The data is first narrowed to
# the response and its predictors so that `price ~ .` uses exactly those
# predictors and nothing else.
selected_vars <- c("price", "category_1", "category_2", "category_3")
Data_Without_Top_100_Customers_subset <-
  Data_Without_Top_100_Customers[, selected_vars]

# Fit the model and print the coefficient table and fit statistics
lr_data <- lm(price ~ ., data = Data_Without_Top_100_Customers_subset)
summary(lr_data)
##
## Call:
## lm(formula = price ~ ., data = Data_Without_Top_100_Customers_subset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -736 -86 -20 30 64396
##
## Coefficients: (17 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.695 14.186 1.811 0.070088 .
## category_1auto 93.278 59.988 1.555 0.119961
## category_1computers 84.887 14.727 5.764 8.22e-09 ***
## category_1construction 118.661 21.624 5.487 4.08e-08 ***
## category_1electronics -3.597 17.488 -0.206 0.837017
## category_1furniture -645.289 109.250 -5.907 3.50e-09 ***
## category_1medicine 13.809 18.901 0.731 0.465022
## category_2audio 92.818 10.428 8.901 < 2e-16 ***
## category_2bathroom 705.414 124.510 5.665 1.47e-08 ***
## category_2bedroom 708.926 128.098 5.534 3.13e-08 ***
## category_2camera 302.284 13.187 22.923 < 2e-16 ***
## category_2components 275.399 4.076 67.572 < 2e-16 ***
## category_2environment 134.557 14.609 9.210 < 2e-16 ***
## category_2kitchen 640.975 45.305 14.148 < 2e-16 ***
## category_2living_room 639.072 110.012 5.809 6.28e-09 ***
## category_2network 164.637 9.032 18.228 < 2e-16 ***
## category_2peripherals NA NA NA NA
## category_2personal NA NA NA NA
## category_2tools NA NA NA NA
## category_2video NA NA NA NA
## category_3air_heater -100.598 11.717 -8.586 < 2e-16 ***
## category_3alarm -37.905 58.594 -0.647 0.517685
## category_3bath NA NA NA NA
## category_3blanket NA NA NA NA
## category_3blender -579.185 44.190 -13.107 < 2e-16 ***
## category_3camera -58.854 5.996 -9.815 < 2e-16 ***
## category_3cdrw -306.248 11.446 -26.755 < 2e-16 ***
## category_3chair 53.597 51.759 1.036 0.300425
## category_3coffee_grinder -607.575 48.565 -12.511 < 2e-16 ***
## category_3coffee_machine -335.425 43.399 -7.729 1.09e-14 ***
## category_3compressor -17.115 58.662 -0.292 0.770473
## category_3cooler -359.450 3.480 -103.284 < 2e-16 ***
## category_3cpu -194.444 2.333 -83.362 < 2e-16 ***
## category_3dictaphone -49.262 19.382 -2.542 0.011034 *
## category_3drill -43.159 16.700 -2.584 0.009756 **
## category_3fan -131.401 37.660 -3.489 0.000485 ***
## category_3fryer -603.810 46.786 -12.906 < 2e-16 ***
## category_3generator 50.375 29.497 1.708 0.087671 .
## category_3gps -40.882 58.727 -0.696 0.486347
## category_3grill -590.532 43.455 -13.589 < 2e-16 ***
## category_3hair_cutter 26.141 16.755 1.560 0.118726
## category_3hdd -274.606 3.679 -74.637 < 2e-16 ***
## category_3headphone -56.335 4.377 -12.870 < 2e-16 ***
## category_3heater -76.058 34.985 -2.174 0.029706 *
## category_3immobilizer -96.295 68.700 -1.402 0.161014
## category_3joystick -9.333 5.691 -1.640 0.101032
## category_3juicer -587.769 43.707 -13.448 < 2e-16 ***
## category_3kettle -614.726 43.587 -14.103 < 2e-16 ***
## category_3keyboard -33.494 7.683 -4.359 1.30e-05 ***
## category_3light -102.844 17.658 -5.824 5.74e-09 ***
## category_3massager 31.822 17.947 1.773 0.076212 .
## category_3meat_grinder -521.713 43.544 -11.981 < 2e-16 ***
## category_3microphone -74.446 7.132 -10.438 < 2e-16 ***
## category_3microwave -507.673 43.489 -11.674 < 2e-16 ***
## category_3mixer -546.208 43.632 -12.518 < 2e-16 ***
## category_3monitor 264.850 5.681 46.623 < 2e-16 ***
## category_3motherboard -288.363 2.265 -127.305 < 2e-16 ***
## category_3mouse -69.817 6.468 -10.794 < 2e-16 ***
## category_3music_tools 322.493 16.526 19.514 < 2e-16 ***
## category_3nas 142.213 21.283 6.682 2.36e-11 ***
## category_3network_adapter -345.520 8.753 -39.473 < 2e-16 ***
## category_3painting -96.124 22.014 -4.367 1.26e-05 ***
## category_3parktronic -70.091 58.832 -1.191 0.233507
## category_3photo -299.473 23.411 -12.792 < 2e-16 ***
## category_3player 19.864 58.341 0.340 0.733500
## category_3power_supply -282.979 3.582 -79.006 < 2e-16 ***
## category_3printer 60.925 4.270 14.269 < 2e-16 ***
## category_3projector 724.593 13.215 54.832 < 2e-16 ***
## category_3pump -50.412 20.759 -2.428 0.015165 *
## category_3radar -12.267 59.818 -0.205 0.837517
## category_3refrigerators -611.263 96.211 -6.353 2.11e-10 ***
## category_3router NA NA NA NA
## category_3saw 49.828 17.232 2.892 0.003832 **
## category_3scales NA NA NA NA
## category_3scanner 106.830 8.498 12.571 < 2e-16 ***
## category_3screw -126.065 17.871 -7.054 1.74e-12 ***
## category_3shelving NA NA NA NA
## category_3soldering -70.023 16.701 -4.193 2.76e-05 ***
## category_3sound_card -290.020 8.678 -33.421 < 2e-16 ***
## category_3steam_cooker -614.139 54.633 -11.241 < 2e-16 ***
## category_3tonometer NA NA NA NA
## category_3toster -614.684 48.840 -12.586 < 2e-16 ***
## category_3tv 331.961 10.478 31.681 < 2e-16 ***
## category_3tv_remote NA NA NA NA
## category_3tv_tuner -345.003 7.218 -47.800 < 2e-16 ***
## category_3vacuum NA NA NA NA
## category_3video NA NA NA NA
## category_3videocards NA NA NA NA
## category_3videoregister -10.030 58.415 -0.172 0.863675
## category_3vr -74.122 24.133 -3.071 0.002131 **
## category_3washer NA NA NA NA
## category_3welding 27.336 17.343 1.576 0.114985
## category_3wifi NA NA NA NA
## category_3winch -87.435 148.022 -0.591 0.554732
## category_3window -58.413 61.925 -0.943 0.345532
## category_3wrench NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 333.3 on 448575 degrees of freedom
## (435810 observations deleted due to missingness)
## Multiple R-squared: 0.1404, Adjusted R-squared: 0.1402
## F-statistic: 938.9 on 78 and 448575 DF, p-value: < 2.2e-16
# Distribution of activity by hour of day, one panel per weekday ----

# Print numbers in full instead of scientific notation
options(scipen = 999)

library(ggplot2)

# Store weekday as an ordered factor running Sunday through Saturday.
# Values that are not one of the listed levels become NA.
Data_Without_Top_100_Customers$weekday <- ordered(
  Data_Without_Top_100_Customers$weekday,
  levels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
)

# Keep only the rows whose event type is neither "view" nor "cart"
filtered_data <- subset(
  Data_Without_Top_100_Customers,
  !(event_type %in% c("view", "cart"))
)

# The histogram needs a numeric x variable
filtered_data[["time_hour"]] <- as.numeric(filtered_data[["time_hour"]])

# Stop if the filter removed every row, since there would be nothing to plot
if (nrow(filtered_data) < 1) {
  stop("Filtered dataset has no rows after filtering event types")
}

# Histogram with one-hour bins, event types side by side, faceted by weekday
ggplot(filtered_data) +
  geom_histogram(
    aes(x = time_hour, fill = event_type),
    binwidth = 1,
    position = "dodge"
  ) +
  facet_wrap(vars(weekday), ncol = 3) +
  labs(
    title = "Distribution of Events by Hour, Weekday, and Event Type (Excluding View and Cart)",
    x = "Hour",
    y = "Count",
    fill = "Event Type"
  ) +
  theme_minimal()
# Clean the data by removing unnecessary columns.
# Columns are dropped by position, so this step depends on the column order of
# the loaded CSV. `drop = FALSE` keeps the result a data frame even if only a
# single column remains.
data_cleaned <- Data_Without_Top_100_Customers[
  ,
  -c(1, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 17),
  drop = FALSE
]

# Preview the cleaned data (head must be called on the data frame; a bare
# `head` only prints the function definition)
head(data_cleaned)
# Mean price for each weekday
with(Data_Without_Top_100_Customers, tapply(price, weekday, mean))
## Sunday Monday Tuesday Wednesday Thursday Friday Saturday
## 147.7764 143.7999 148.8428 144.3208 147.6150 147.5641 144.4111

# Mean price for each event type (rows) and weekday (columns)
with(
  Data_Without_Top_100_Customers,
  tapply(price, list(event_type, weekday), mean)
)
## Sunday Monday Tuesday Wednesday Thursday Friday Saturday
## cart 164.1900 154.2199 162.7951 152.3394 163.5810 160.6858 160.2764
## purchase 141.9904 135.2100 136.7100 129.9932 138.6087 140.4071 138.6811
## view 146.9437 143.4981 148.4461 144.4641 146.9411 147.0144 143.6201
# Boxplot of price by weekday ----
# boxplot() draws the plot and returns its summary statistics, kept in `bp`.
# `outline = FALSE` only hides the outlier points from the drawing.
bp <- boxplot(
  price ~ weekday,
  data = Data_Without_Top_100_Customers,
  xlab = NA,                            # No x-axis label
  ylab = "Price",                       # Y-axis label
  main = "Boxplot of Price by Weekday", # Title
  outline = FALSE                       # Do not draw outliers
)

# Label each box with its median.
# bp$stats has one column per box; its rows are lower whisker, lower hinge,
# median, upper hinge and upper whisker, so row 3 is the median (not the mean
# and not the 3rd quartile).
text(
  x = seq_len(ncol(bp$stats)),      # One x position per box
  y = bp$stats[3, ],                # Height of the median line
  labels = round(bp$stats[3, ], 2), # Median rounded to 2 decimals
  pos = 3,                          # Place the text above the median line
  cex = 0.7,                        # Text size
  col = "blue"                      # Text color
)
# Event types across weekdays: base graphics and ggplot2 versions ----
library(ggplot2)

# Cross-tabulate event type (rows) against weekday (columns)
event_by_weekday <- table(
  Data_Without_Top_100_Customers$event_type,
  Data_Without_Top_100_Customers$weekday
)

# Grouped bar chart in base R: one cluster per weekday, one bar per event type
barplot(
  event_by_weekday,
  main = "Distribution of Event Types Across Weekdays",
  xlab = "Weekday",
  ylab = "Frequency",
  beside = TRUE,
  legend.text = TRUE
)

# Stacked bar chart in ggplot2: event counts per weekday, split by event type
ggplot(Data_Without_Top_100_Customers) +
  geom_bar(aes(x = weekday, fill = event_type), position = "stack") +
  labs(
    title = "Event Type Distribution by Day of the Week",
    x = "Weekday",
    y = "Count of Events",
    fill = "Event Type"
  ) +
  theme_minimal()