Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
# Read the raw customer records from disk, then print a per-column summary
# (ranges for numeric columns, counts for logical ones, NA totals).
data_raw <- read.csv(file = "~/Downloads/question1data.csv")
summary(object = data_raw)
## custid sex is.employed income
## Min. : 2068 Length:1000 Mode :logical Min. : -8700
## 1st Qu.: 345667 Class :character FALSE:73 1st Qu.: 14600
## Median : 693403 Mode :character TRUE :599 Median : 35000
## Mean : 698500 NA's :328 Mean : 53505
## 3rd Qu.:1044606 3rd Qu.: 67000
## Max. :1414286 Max. :615000
##
## marital.stat health.ins housing.type recent.move
## Length:1000 Mode :logical Length:1000 Mode :logical
## Class :character FALSE:159 Class :character FALSE:820
## Mode :character TRUE :841 Mode :character TRUE :124
## NA's :56
##
##
##
## num.vehicles age state.of.res
## Min. :0.000 Min. : 0.0 Length:1000
## 1st Qu.:1.000 1st Qu.: 38.0 Class :character
## Median :2.000 Median : 50.0 Mode :character
## Mean :1.916 Mean : 51.7
## 3rd Qu.:2.000 3rd Qu.: 64.0
## Max. :6.000 Max. :146.7
## NA's :56
# Build the cleaned customer table:
#   * income: only strictly positive values are kept (as doubles); zero and
#     negative values become NA
#   * age: only values in [18, 100] are kept; everything else becomes NA
#   * the four categorical text columns are converted to factors
data_clean <- data_raw %>%
  mutate(
    income = if_else(income > 0, as.numeric(income), NA_real_),
    age = as.numeric(age),
    age = if_else(age >= 18 & age <= 100, age, NA_real_)
  ) %>%
  mutate(
    across(c(state.of.res, housing.type, sex, marital.stat), as.factor)
  )

# Count the missing values left in every column after cleaning
data_clean %>%
  summarise(across(everything(), function(column) sum(is.na(column))))
# Number of customers per state of residence, largest first; rows without a
# state are left out.
state_counts <- data_clean %>%
  filter(!is.na(state.of.res)) %>%
  count(state.of.res, name = "n", sort = TRUE)

# Horizontal bar chart with the states ordered by customer count
ggplot(state_counts, aes(reorder(state.of.res, n), n)) +
  geom_col(fill = "steelblue") +
  ggtitle("Customer Distribution by State") +
  xlab("State") +
  ylab("Number of Customers") +
  coord_flip() +
  theme_minimal()
This distribution suggests that marketing and sales efforts should focus more on high-customer states such as California and New York, while niche or targeted strategies may be more suitable for lower-customer states.
# Keep only customers for whom both age and income are known
age_income <- data_clean %>%
  filter(!(is.na(age) | is.na(income)))

# Scatter plot of income against age with a linear trend line and its
# confidence band
ggplot(age_income, aes(age, income)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE) +
  ggtitle("Age vs Income (with Trend Line)") +
  xlab("Age") +
  ylab("Income") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
The plot shows a weak positive relationship between age and income. Most customers have moderate income levels across all age groups, with a few high-income outliers mainly in middle age ranges.
# Keep only customers with a known age, income and housing type
age_income_housing <- data_clean %>%
  filter(!(is.na(age) | is.na(income) | is.na(housing.type)))

# Income against age, coloured by housing type, with one linear trend line
# per housing type (no confidence band)
ggplot(age_income_housing, aes(age, income, color = housing.type)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Age vs Income by Housing Type") +
  xlab("Age") +
  ylab("Income") +
  labs(color = "Housing Type") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Same age/income relationship, drawn as one panel per housing type
ggplot(age_income_housing, aes(age, income)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ housing.type) +
  ggtitle("Age vs Income (Faceted by Housing Type)") +
  xlab("Age") +
  ylab("Income") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Load the January 2008 laptop sales file. check.names = TRUE makes the
# header syntactically valid (spaces and brackets become dots), which is why
# the column names printed below contain dots.
laptop <- read.csv(
  file = "~/Downloads/LaptopSalesJanuary2008.csv",
  check.names = TRUE
)
names(laptop)
## [1] "Date" "Configuration" "Customer.Postcode"
## [4] "Store.Postcode" "Retail.Price" "Screen.Size..Inches."
## [7] "Battery.Life..Hours." "RAM..GB." "Processor.Speeds..GHz."
## [10] "Integrated.Wireless." "HD.Size..GB." "Bundled.Applications."
## [13] "OS.X.Customer" "OS.Y.Customer" "OS.X.Store"
## [16] "OS.Y.Store" "CustomerStoreDistance"
# Detect columns robustly.
#
# Resolves the names of the store-postcode and retail-price columns of
# `laptop` into `store_col` and `price_col`:
#   1. try a short list of exact spellings;
#   2. for a column that is still unresolved, fall back to the first column
#      whose name matches a case-insensitive pattern;
#   3. stop with an error if either column is still unresolved.
store_col_candidates <- c("Store.Postcode", "Store_Postcode", "StorePostcode")
price_col_candidates <- c("Retail.Price", "Retail_Price", "RetailPrice")

# `[1]` on an empty result yields NA, which marks the column as unresolved.
store_col <- intersect(store_col_candidates, colnames(laptop))[1]
price_col <- intersect(price_col_candidates, colnames(laptop))[1]

# The fallback is applied per column: a name already found by exact match
# must not be replaced by whatever the looser pattern happens to hit first.
if (is.na(store_col)) {
  store_col <- colnames(laptop)[
    str_detect(colnames(laptop), regex("store.*post", ignore_case = TRUE))
  ][1]
}
if (is.na(price_col)) {
  price_col <- colnames(laptop)[
    str_detect(colnames(laptop), regex("retail.*price", ignore_case = TRUE))
  ][1]
}

if (is.na(store_col) || is.na(price_col)) {
  stop("Could not find Store Postcode and Retail Price columns. Check colnames(laptop).")
}
# Drop rows where the store postcode or the retail price is missing
laptop_clean <- laptop %>%
  filter(!(is.na(.data[[store_col]]) | is.na(.data[[price_col]])))

# Mean retail price per store, sorted from most to least expensive
avg_price_store <- laptop_clean %>%
  group_by(store = .data[[store_col]]) %>%
  summarise(
    avg_price = mean(.data[[price_col]], na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_price))

# Because the table is sorted in descending order, the first row is the
# store with the highest average price and the last row the lowest.
highest_store <- slice_head(avg_price_store, n = 1)
lowest_store <- slice_tail(avg_price_store, n = 1)
highest_store
lowest_store
# Horizontal bar chart of the per-store average price, bars ordered by price
ggplot(avg_price_store, aes(reorder(store, avg_price), avg_price)) +
  geom_col(fill = "darkgreen") +
  ggtitle("Average Retail Price by Store") +
  xlab("Store Postcode") +
  ylab("Average Retail Price") +
  coord_flip() +
  theme_minimal()
The bar chart shows the average retail price of laptops across store
locations. The store with postcode N17 6QA has the highest average
retail price, while the store with postcode W4 3PH has the lowest
average retail price. This indicates pricing variation across store
locations.
# One boxplot of retail price per store postcode. The axis-text tweak is
# added after theme_minimal() so the complete theme does not reset it.
ggplot(
  laptop_clean,
  aes(as.factor(.data[[store_col]]), .data[[price_col]])
) +
  geom_boxplot(fill = "orange", alpha = 0.7) +
  ggtitle("Retail Price Distribution by Store") +
  xlab("Store Postcode") +
  ylab("Retail Price") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
The boxplots show the distribution of laptop retail prices at each store location. The store with the highest average retail price (N17 6QA) has a slightly higher median price and a slightly higher overall price distribution than the other stores. The store with the lowest average retail price (W4 3PH) has a comparatively lower median and a slightly lower overall price distribution.
# Load the built-in USArrests data and move the state names out of the row
# names into a regular `State` column, then show the resulting structure.
data(USArrests)
us_data <- rownames_to_column(as.data.frame(USArrests), var = "State")
us_data %>% glimpse()
## Rows: 50
## Columns: 5
## $ State <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Co…
## $ Murder <dbl> 13.2, 10.0, 8.1, 8.8, 9.0, 7.9, 3.3, 5.9, 15.4, 17.4, 5.3, 2.…
## $ Assault <int> 236, 263, 294, 190, 276, 204, 110, 238, 335, 211, 46, 120, 24…
## $ UrbanPop <int> 58, 48, 80, 50, 91, 78, 77, 72, 80, 60, 83, 54, 83, 65, 57, 6…
## $ Rape <dbl> 21.2, 44.5, 31.0, 19.5, 40.6, 38.7, 11.1, 15.8, 31.9, 25.8, 2…
# Per-column summary statistics for the arrests data
summary(object = us_data)
## State Murder Assault UrbanPop
## Length:50 Min. : 0.800 Min. : 45.0 Min. :32.00
## Class :character 1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50
## Mode :character Median : 7.250 Median :159.0 Median :66.00
## Mean : 7.788 Mean :170.8 Mean :65.54
## 3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75
## Max. :17.400 Max. :337.0 Max. :91.00
## Rape
## Min. : 7.30
## 1st Qu.:15.07
## Median :20.10
## Mean :21.23
## 3rd Qu.:26.18
## Max. :46.00
# The ten rows with the highest murder rate, highest first
top_murder <- us_data %>%
  arrange(desc(Murder)) %>%
  slice_head(n = 10)
top_murder

# Horizontal bar chart of those ten states, ordered by murder rate
ggplot(top_murder, aes(reorder(State, Murder), Murder)) +
  geom_col(fill = "red") +
  ggtitle("Top 10 States by Murder Rate") +
  xlab("State") +
  ylab("Murder Rate") +
  coord_flip() +
  theme_minimal()
# Assault rate against urban population share, with a linear trend line
ggplot(us_data, aes(UrbanPop, Assault)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm") +
  ggtitle("Assault Rate vs Urban Population") +
  xlab("Urban Population (%)") +
  ylab("Assault Rate") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Single boxplot of the Rape column across all states
ggplot(us_data, aes(y = Rape)) +
  geom_boxplot(fill = "purple") +
  ggtitle("Distribution of Rape Rates Across States") +
  ylab("Rape Rate") +
  theme_minimal()