install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load dplyr
install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(dplyr)
# Load ggplot2
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ggplot2)
# Load after sales dataset
after_sales <- read.csv("/cloud/project/solar/after_sales.csv")
colnames(after_sales)
## [1] "service_id" "sale_id" "client_id" "service_type" "service_date"
# Load clients dataset
clients <- read.csv("/cloud/project/solar/clients.csv")
colnames(clients)
## [1] "client_id" "name" "gender" "age"
## [5] "location" "income" "marital_status"
# Load payments dataset
payments <- read.csv("/cloud/project/solar/payments.csv")
colnames(payments)
## [1] "payment_id" "sale_id" "amount_paid" "payment_date" "defaulted"
# Load sales dataset
sales <- read.csv("/cloud/project/solar/sales.csv")
colnames(sales)
## [1] "sale_id" "client_id" "product_type" "product_category"
## [5] "pay_model" "sale_date"
# Load dplyr and ggpplot
library(dplyr)
library(ggplot2)
# What is the client breakdown by gender, age and location?
demographic <- clients %>%
group_by(gender,location)%>%
summarise(avg_age = mean(age,na.rm=TRUE),.groups='drop')
print (demographic)
## # A tibble: 18 × 3
## gender location avg_age
## <chr> <chr> <dbl>
## 1 Female " Rural " 45
## 2 Female " Suburban " 39
## 3 Female " Urban " 31
## 4 Female "Rural" 41.2
## 5 Female "Suburban" 42.6
## 6 Female "Urban" 42.3
## 7 Male " Rural " 39.8
## 8 Male " Suburban " 30.5
## 9 Male " Urban " 31.2
## 10 Male "Rural" 38.2
## 11 Male "Suburban" 34.5
## 12 Male "Urban" 32.4
## 13 Other " Rural " 35.5
## 14 Other " Suburban " 38.2
## 15 Other " Urban " 37
## 16 Other "Rural" 34.8
## 17 Other "Suburban" 49
## 18 Other "Urban" 40.8
# Visualize output
ggplot(data=demographic, mapping=aes(x=location,y=avg_age,fill=gender))+
geom_col(color='black')

# Most common service types in the after-sales records?
common_service <- after_sales %>%
group_by(service_type)%>%
summarise(service_count = n())%>%
arrange(desc(service_count))
print(common_service)
## # A tibble: 6 × 2
## service_type service_count
## <chr> <int>
## 1 "Installation" 34
## 2 "Maintenance" 24
## 3 "Repair" 19
## 4 " Maintenance " 8
## 5 " Repair " 8
## 6 " Installation " 7
# Visualize output
ggplot(data=common_service,mapping=aes(x=service_type,y=service_count))+
geom_bar(stat='identity', color='blue') +
labs(title='Most Common Service Types') + theme_minimal()

# Total number of sales and after-sales services completed
# join payment, sales and after sales tables
sales_payments <- left_join(sales,payments,by='sale_id')
# join sales_payments table to after sales table
sales_payments_aftersales <- left_join(sales_payments,after_sales,by='sale_id')
## Warning in left_join(sales_payments, after_sales, by = "sale_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 14 of `x` matches multiple rows in `y`.
## ℹ Row 71 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# establish total sales and after sales services
total <- sales_payments_aftersales %>%
group_by (product_type) %>%
summarise(total_amount = sum(amount_paid,na.rm=TRUE),
service_count = n()
,.groups='drop')%>%
arrange(desc(total_amount))
print(total)
## # A tibble: 6 × 3
## product_type total_amount service_count
## <chr> <dbl> <int>
## 1 "Solar Home System" 21474. 76
## 2 "Solar Water Pump" 14511. 47
## 3 "Solar Lantern" 14143. 47
## 4 " Solar Lantern " 12475. 35
## 5 " Solar Water Pump " 8211. 24
## 6 " Solar Home System " 2253. 9
# visualize output
ggplot(data=total,mapping=aes(x=product_type,y=total_amount,fill=service_count))+
geom_col() + labs(title='Total Sales & After sales Count') +
theme(axis.text.x = element_text(angle=45,hjust=1))

# Are there any sales that have no associated payments?
# If so, how many?
null_sales<-sales_payments %>%
filter(is.na(amount_paid)) %>%
group_by(sale_id)%>%
summarise(sales_count=n())%>%
arrange(desc(sales_count))%>%
slice(1:5)
print(null_sales)
## # A tibble: 5 × 2
## sale_id sales_count
## <int> <int>
## 1 32 2
## 2 110 2
## 3 1 1
## 4 4 1
## 5 5 1
# Which pay model is most commonly used across sales transactions?
top_model<- sales_payments %>%
group_by(pay_model) %>%
summarise(model_count=n())%>%
arrange(desc(model_count))
print(top_model)
## # A tibble: 2 × 2
## pay_model model_count
## <chr> <int>
## 1 Direct Cash 112
## 2 PAYGO 89
# visualize output
ggplot(data=top_model, mapping=aes(x=pay_model, y=model_count))+
geom_col(color='black', fill='lightblue') +
labs(title='Pay Model Usage')

# What is the average time between a sale and its payment date?
# Does this vary significantly by product type or pay model?
# step1: ensure the dates are in the correct format
sales_payments <- sales_payments %>%
mutate(sale_date=as.Date(sale_date),payment_date=as.Date(payment_date))
# step2:calculate time difference between sales date and payment date
sales_payments <- sales_payments %>%
mutate(time_diff=as.numeric(difftime(payment_date,sale_date,units='days')))
# step3: calculate average time difference by product
avg_time_product <- sales_payments %>%
group_by (product_type)%>%
summarise(avg_time = mean(time_diff,na.rm=TRUE),.groups='drop')
print(avg_time_product)
## # A tibble: 6 × 2
## product_type avg_time
## <chr> <dbl>
## 1 " Solar Home System " -57.3
## 2 " Solar Lantern " 8.27
## 3 " Solar Water Pump " 12.1
## 4 "Solar Home System" -21.6
## 5 "Solar Lantern" 32.6
## 6 "Solar Water Pump" -6.70
# visualize output above
ggplot(data=avg_time_product, mapping=aes(x=product_type,y=avg_time)) +
geom_col(fill='black')+theme(axis.text.x = element_text(angle=45,hjust=1))+
labs(title='Average Time difference by Product')

# step4: calculate average time difference by pay model
avg_time_paymodel <- sales_payments %>%
group_by (pay_model)%>%
summarise(avg_time = mean(time_diff,na.rm=TRUE),.groups='drop')
print(avg_time_paymodel)
## # A tibble: 2 × 2
## pay_model avg_time
## <chr> <dbl>
## 1 Direct Cash 5.91
## 2 PAYGO -8.61
# visualize output above
ggplot(data=avg_time_paymodel, mapping=aes(x=pay_model,y=avg_time)) +
geom_col(fill='blue')+theme(axis.text.x = element_text(angle=45,hjust=1))+
labs(title='Average Time difference by PayModel')

# What is the average amount paid and how does it vary across client income levels?
# step1: create income categories
income_group <- cut(clients$income,
breaks=c(5000,10000,20000,30000,40000,50000),
labels=c("5000-10000","10001-20000","20001-30000","30001-40000","40001-50000"),
include.lowest='TRUE')
# step2: create new income category column in client table
clients <- clients %>%
mutate(Income_Group = income_group)
# step4: join the salespayment table to the client tables
sales_payments_clients <- left_join(sales_payments,clients,by='client_id')
#step5: calculate the average amount paid
average_amount <- sales_payments_clients %>%
group_by (Income_Group)%>%
summarise(avg_amount_paid=mean(amount_paid,na.rm=TRUE),.groups='drop')
print(average_amount)
## # A tibble: 6 × 2
## Income_Group avg_amount_paid
## <fct> <dbl>
## 1 5000-10000 437.
## 2 10001-20000 483.
## 3 20001-30000 479.
## 4 30001-40000 594.
## 5 40001-50000 532.
## 6 <NA> 532.
# visualize output above
ggplot(data=average_amount,mapping=aes(x=Income_Group,y=avg_amount_paid,fill=Income_Group))+
geom_col(color='black')+theme(axis.text.x = element_text(angle=45,hjust=1))+
labs(title='Average amount Vs Income')

# What is the distribution of service types provided in after-sales support?
ggplot(data=after_sales,mapping=aes(x=service_type))+
geom_bar(binwidth=5, fill='lightblue',color='black')+
labs(title='Distribution of Service Types')
## Warning in geom_bar(binwidth = 5, fill = "lightblue", color = "black"):
## Ignoring unknown parameters: `binwidth`

# Is there a relationship between product type and the type of after-sales service requested?
# step1: join the sales and after sales table
sales_after_sales <- left_join(sales,after_sales,by='sale_id')
# step2: establish relationship
relationship <- sales_after_sales %>%
filter(!is.na(product_type) & !is.na(service_type))%>%
group_by(product_type,service_type)%>%
summarise(service_count=n(),.groups='drop')
print(relationship)
## # A tibble: 30 × 3
## product_type service_type service_count
## <chr> <chr> <int>
## 1 " Solar Home System " " Maintenance " 1
## 2 " Solar Home System " "Installation" 2
## 3 " Solar Home System " "Repair" 1
## 4 " Solar Lantern " " Installation " 2
## 5 " Solar Lantern " " Maintenance " 3
## 6 " Solar Lantern " "Installation" 8
## 7 " Solar Lantern " "Maintenance" 2
## 8 " Solar Lantern " "Repair" 1
## 9 " Solar Water Pump " " Maintenance " 1
## 10 " Solar Water Pump " "Installation" 2
## # ℹ 20 more rows
#step3:visualize the results above
ggplot(relationship, aes(x = product_type, y = service_type, fill = service_count)) +
geom_tile(color = 'white') +
scale_fill_gradient(low = "lightblue", high = "blue") +
labs(title = 'Product Types vs After-Sales Service Types') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# What is the gender distribution of clients across different payment models?
# step1: create a client sales table
client_sales <- left_join(clients,sales,by='client_id')
# step2: calculate the client count across payment models
gender_distribution <- client_sales %>%
filter(!is.na(pay_model))%>%
group_by(pay_model, gender)%>%
summarise(gender_count=n(),.groups='drop')
print(gender_distribution)
## # A tibble: 6 × 3
## pay_model gender gender_count
## <chr> <chr> <int>
## 1 Direct Cash Female 30
## 2 Direct Cash Male 29
## 3 Direct Cash Other 23
## 4 PAYGO Female 16
## 5 PAYGO Male 21
## 6 PAYGO Other 31
# step2: visualize the distribution
ggplot(gender_distribution,aes(x=pay_model,y=gender_count,fill=gender))+
geom_col(color='black')+labs(title='Gender Distribution by Payment Model')

# What are the peak months or seasons for payments?
# step1:calculate the peak sales period
peak_payments <- payments %>%
filter(!is.na(payment_date))%>%
group_by (payment_date)%>%
summarise(total_amount=sum(amount_paid),.group='drop')%>%
arrange(desc(total_amount))%>%
slice(1:5)
print(peak_payments)
## # A tibble: 5 × 3
## payment_date total_amount .group
## <chr> <dbl> <chr>
## 1 2022-02-27 978. drop
## 2 2022-03-12 965. drop
## 3 2022-05-12 953 drop
## 4 2022-02-12 941. drop
## 5 2022-03-31 908. drop
# how does product category usage vary by marital status
#re-load tidyverse and dplyr
library(tidyverse)
library(dplyr)
#step1: join the sales table and clients table
sales_clients <- left_join(sales,clients,by='client_id')
#step2: aggregate product category and group by marital status
category_usage <- sales_clients %>%
group_by(marital_status,product_category)%>%
summarise(category_count=n(),.groups='drop')%>%
mutate(percentage_category=category_count/sum(category_count)*100)
print(category_usage)
## # A tibble: 29 × 4
## marital_status product_category category_count percentage_category
## <chr> <chr> <int> <dbl>
## 1 " Divorced " " Home Use " 1 0.667
## 2 " Divorced " " Productive Use " 1 0.667
## 3 " Divorced " "Home Use" 3 2
## 4 " Married " "Home Use" 1 0.667
## 5 " Married " "Productive Use" 2 1.33
## 6 " Single " " Home Use " 3 2
## 7 " Single " " Productive Use " 2 1.33
## 8 " Single " "Home Use" 7 4.67
## 9 " Single " "Productive Use" 9 6
## 10 " Widowed " " Home Use " 4 2.67
## # ℹ 19 more rows
# visualize output
ggplot(category_usage,aes(x=product_category,y=marital_status,fill=category_count))+
geom_tile(color='white')+
scale_fill_gradient(low='lightgreen',high='darkgreen')+
labs(title='Category Usage by Marital Status')+
theme(axis.text.x = element_text(angle=45,hjust=1))

# How do default rates vary by the different pay models?
default_rates <-sales_payments %>%
filter(!is.na(defaulted))%>%
group_by(pay_model,defaulted)%>%
summarise(default_count=n(),.groups='drop')%>%
mutate(default_rate=default_count/sum(default_count)*100)
print(default_rates)
## # A tibble: 4 × 4
## pay_model defaulted default_count default_rate
## <chr> <chr> <int> <dbl>
## 1 Direct Cash No 41 27.3
## 2 Direct Cash Yes 39 26
## 3 PAYGO No 26 17.3
## 4 PAYGO Yes 44 29.3
# visualize output
ggplot(default_rates,aes(x=pay_model,y=default_rate,fill=defaulted))+
geom_col() + labs(title='Default rate by Pay Model')
