Data Mining Business Decisions.Week3

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

# Load the raw customer survey extract and print a per-column summary
# to get a first look at types, ranges and missing values.
data_raw <- read.csv(file = "~/Downloads/question1data.csv")

summary(object = data_raw)

##      custid            sex            is.employed         income      
##  Min.   :   2068   Length:1000        Mode :logical   Min.   : -8700  
##  1st Qu.: 345667   Class :character   FALSE:73        1st Qu.: 14600  
##  Median : 693403   Mode  :character   TRUE :599       Median : 35000  
##  Mean   : 698500                      NA's :328       Mean   : 53505  
##  3rd Qu.:1044606                                      3rd Qu.: 67000  
##  Max.   :1414286                                      Max.   :615000  
##                                                                       
##  marital.stat       health.ins      housing.type       recent.move    
##  Length:1000        Mode :logical   Length:1000        Mode :logical  
##  Class :character   FALSE:159       Class :character   FALSE:820      
##  Mode  :character   TRUE :841       Mode  :character   TRUE :124      
##                                                        NA's :56       
##                                                                       
##                                                                       
##                                                                       
##   num.vehicles        age        state.of.res      
##  Min.   :0.000   Min.   :  0.0   Length:1000       
##  1st Qu.:1.000   1st Qu.: 38.0   Class :character  
##  Median :2.000   Median : 50.0   Mode  :character  
##  Mean   :1.916   Mean   : 51.7                     
##  3rd Qu.:2.000   3rd Qu.: 64.0                     
##  Max.   :6.000   Max.   :146.7                     
##  NA's   :56

# Build the cleaned customer table:
#   * income: only strictly positive values are kept (stored as double),
#     everything else becomes NA
#   * age: coerced to double; values outside the 18-100 range become NA
#   * sex, marital.stat, housing.type, state.of.res: converted to factors
data_clean <- data_raw %>%
  mutate(
    income = if_else(income > 0, as.numeric(income), NA_real_),
    age = as.numeric(age),
    age = if_else(between(age, 18, 100), age, NA_real_),
    across(
      c(state.of.res, housing.type, sex, marital.stat),
      as.factor
    )
  )

# Count the NA entries in every column of the cleaned table
# (one-row data frame, one count per column).
data_clean %>%
  summarise(
    across(everything(), function(column) sum(is.na(column)))
  )

# Number of customers per state of residence, most populous state first.
# Rows with a missing state are excluded before counting.
state_counts <- data_clean %>%
  filter(!is.na(state.of.res)) %>%
  count(state.of.res, name = "n", sort = TRUE)

# Horizontal bar chart of customer counts per state; states are ordered
# by count so the largest bar ends up at the top after the flip.
ggplot(data = state_counts) +
  geom_col(
    mapping = aes(x = reorder(state.of.res, n), y = n),
    fill = "steelblue"
  ) +
  coord_flip() +
  theme_minimal() +
  ggtitle("Customer Distribution by State") +
  xlab("State") +
  ylab("Number of Customers")

This distribution suggests that marketing and sales efforts should focus more on high-customer states such as California and New York, while niche or targeted strategies may be more suitable for lower-customer states.

# Keep only customers for whom both age and income are known.
age_income <- data_clean %>%
  filter(!(is.na(age) | is.na(income)))

# Scatter plot of income against age with a linear trend line and its
# confidence band.
ggplot(data = age_income, mapping = aes(x = age, y = income)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE) +
  theme_minimal() +
  ggtitle("Age vs Income (with Trend Line)") +
  xlab("Age") +
  ylab("Income")

## `geom_smooth()` using formula = 'y ~ x'

The plot shows a weak positive relationship between age and income. Most customers have moderate income levels across all age groups, with a few high-income outliers mainly in middle age ranges.

# Keep only customers with known age, income and housing type.
age_income_housing <- data_clean %>%
  filter(!(is.na(age) | is.na(income) | is.na(housing.type)))

# Income against age, coloured by housing type, with one linear trend
# line per housing type (no confidence bands).
ggplot(
  data = age_income_housing,
  mapping = aes(x = age, y = income, color = housing.type)
) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal() +
  ggtitle("Age vs Income by Housing Type") +
  xlab("Age") +
  ylab("Income") +
  labs(color = "Housing Type")

## `geom_smooth()` using formula = 'y ~ x'

# Same age/income relationship, drawn as one panel per housing type,
# each with its own linear trend line.
ggplot(data = age_income_housing, mapping = aes(x = age, y = income)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(vars(housing.type)) +
  theme_minimal() +
  ggtitle("Age vs Income (Faceted by Housing Type)") +
  xlab("Age") +
  ylab("Income")

## `geom_smooth()` using formula = 'y ~ x'

# Load the January 2008 laptop sales data. check.names = TRUE converts the
# CSV headers into syntactic R names (spaces and brackets become dots).
laptop <- read.csv(
  file = "~/Downloads/LaptopSalesJanuary2008.csv",
  check.names = TRUE
)
# List the resulting column names.
names(laptop)

##  [1] "Date"                   "Configuration"          "Customer.Postcode"     
##  [4] "Store.Postcode"         "Retail.Price"           "Screen.Size..Inches."  
##  [7] "Battery.Life..Hours."   "RAM..GB."               "Processor.Speeds..GHz."
## [10] "Integrated.Wireless."   "HD.Size..GB."           "Bundled.Applications." 
## [13] "OS.X.Customer"          "OS.Y.Customer"          "OS.X.Store"            
## [16] "OS.Y.Store"             "CustomerStoreDistance"

# Detect columns robustly.
# Each column is resolved on its own: the exact candidate names are tried
# first, and only a column that is still unresolved falls back to a
# case-insensitive pattern search. Resolving them independently means an
# exact match for one column is never overwritten by a looser pattern hit
# just because the other column needed the fallback.
store_col_candidates <- c("Store.Postcode", "Store_Postcode", "StorePostcode")
price_col_candidates <- c("Retail.Price", "Retail_Price", "RetailPrice")

# intersect() keeps the candidate order; `[1]` yields NA when nothing matched.
store_col <- intersect(store_col_candidates, colnames(laptop))[1]
price_col <- intersect(price_col_candidates, colnames(laptop))[1]

if (is.na(store_col)) {
  store_col <- grep(
    "store.*post", colnames(laptop),
    ignore.case = TRUE, value = TRUE
  )[1]
}

if (is.na(price_col)) {
  price_col <- grep(
    "retail.*price", colnames(laptop),
    ignore.case = TRUE, value = TRUE
  )[1]
}

# Fail early with a clear message if either column is still unknown.
if (is.na(store_col) || is.na(price_col)) {
  stop(
    "Could not find Store Postcode and Retail Price columns. Check colnames(laptop).",
    call. = FALSE
  )
}

# Drop sales records that lack either the store postcode or the price.
laptop_clean <- laptop %>%
  filter(!(is.na(.data[[store_col]]) | is.na(.data[[price_col]])))

# Mean retail price per store, most expensive store first.
avg_price_store <- laptop_clean %>%
  mutate(store = .data[[store_col]]) %>%
  group_by(store) %>%
  summarise(
    avg_price = mean(.data[[price_col]], na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_price))

# The table is sorted in descending order, so the first row is the store
# with the highest average price and the last row the one with the lowest.
highest_store <- slice_head(avg_price_store, n = 1)
lowest_store  <- slice_tail(avg_price_store, n = 1)

highest_store

lowest_store

# Horizontal bar chart of the average retail price per store, ordered by
# price so the most expensive store is drawn at the top.
ggplot(data = avg_price_store) +
  geom_col(
    mapping = aes(x = reorder(store, avg_price), y = avg_price),
    fill = "darkgreen"
  ) +
  coord_flip() +
  theme_minimal() +
  ggtitle("Average Retail Price by Store") +
  xlab("Store Postcode") +
  ylab("Average Retail Price")

The bar chart shows the average retail price of laptops across store locations. The store with postcode N17 6QA has the highest average retail price, while the store with postcode W4 3PH has the lowest average retail price. This indicates pricing variation across store locations.

# One boxplot of retail prices per store; the x-axis labels are rotated
# 45 degrees so the postcodes do not overlap.
ggplot(
  data = laptop_clean,
  mapping = aes(
    x = as.factor(.data[[store_col]]),
    y = .data[[price_col]]
  )
) +
  geom_boxplot(fill = "orange", alpha = 0.7) +
  ggtitle("Retail Price Distribution by Store") +
  xlab("Store Postcode") +
  ylab("Retail Price") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The boxplots show the distribution of laptop retail prices at each store location. The store with the highest average retail price (N17 6QA) has a slightly higher median, and its price distribution as a whole sits slightly above those of the other stores. The store with the lowest average retail price (W4 3PH) has a comparatively lower median, and its price distribution as a whole sits slightly lower.

# Load the built-in USArrests data set.
data("USArrests")

# The state names are stored as row names; move them into a regular
# "State" column so they can be used like any other variable.
us_data <- rownames_to_column(as.data.frame(USArrests), var = "State")

# Compact overview of column types and first values.
glimpse(us_data)

## Rows: 50
## Columns: 5
## $ State    <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Co…
## $ Murder   <dbl> 13.2, 10.0, 8.1, 8.8, 9.0, 7.9, 3.3, 5.9, 15.4, 17.4, 5.3, 2.…
## $ Assault  <int> 236, 263, 294, 190, 276, 204, 110, 238, 335, 211, 46, 120, 24…
## $ UrbanPop <int> 58, 48, 80, 50, 91, 78, 77, 72, 80, 60, 83, 54, 83, 65, 57, 6…
## $ Rape     <dbl> 21.2, 44.5, 31.0, 19.5, 40.6, 38.7, 11.1, 15.8, 31.9, 25.8, 2…

# Per-column summary of the arrests table (five-number summary and mean
# for the numeric columns, length/class/mode for the State column).
summary(us_data)

##     State               Murder          Assault         UrbanPop    
##  Length:50          Min.   : 0.800   Min.   : 45.0   Min.   :32.00  
##  Class :character   1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50  
##  Mode  :character   Median : 7.250   Median :159.0   Median :66.00  
##                     Mean   : 7.788   Mean   :170.8   Mean   :65.54  
##                     3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75  
##                     Max.   :17.400   Max.   :337.0   Max.   :91.00  
##       Rape      
##  Min.   : 7.30  
##  1st Qu.:15.07  
##  Median :20.10  
##  Mean   :21.23  
##  3rd Qu.:26.18  
##  Max.   :46.00

# The ten states with the highest murder rate, highest first.
top_murder <- us_data %>%
  arrange(desc(Murder)) %>%
  slice_head(n = 10)

top_murder

# Horizontal bar chart of the ten highest murder rates, ordered so the
# state with the highest rate is drawn at the top.
ggplot(data = top_murder) +
  geom_col(
    mapping = aes(x = reorder(State, Murder), y = Murder),
    fill = "red"
  ) +
  coord_flip() +
  theme_minimal() +
  ggtitle("Top 10 States by Murder Rate") +
  xlab("State") +
  ylab("Murder Rate")

# Assault rate against urban population share, with a linear trend line.
ggplot(data = us_data, mapping = aes(x = UrbanPop, y = Assault)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm") +
  theme_minimal() +
  ggtitle("Assault Rate vs Urban Population") +
  xlab("Urban Population (%)") +
  ylab("Assault Rate")

## `geom_smooth()` using formula = 'y ~ x'

# Single boxplot showing how the rape rate is distributed across states.
ggplot(data = us_data, mapping = aes(y = Rape)) +
  geom_boxplot(fill = "purple") +
  theme_minimal() +
  ggtitle("Distribution of Rape Rates Across States") +
  ylab("Rape Rate")

Data Mining Business Decisions.Week3

Praveen Singh

2026-02-09