install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load dplyr
install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(dplyr)

# Load ggplot2
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ggplot2)
# Load after sales dataset
after_sales <- read.csv("/cloud/project/solar/after_sales.csv")
colnames(after_sales)
## [1] "service_id"   "sale_id"      "client_id"    "service_type" "service_date"
# Load clients dataset
clients <- read.csv("/cloud/project/solar/clients.csv")
colnames(clients)
## [1] "client_id"      "name"           "gender"         "age"           
## [5] "location"       "income"         "marital_status"
# Load payments dataset
payments <- read.csv("/cloud/project/solar/payments.csv")
colnames(payments)
## [1] "payment_id"   "sale_id"      "amount_paid"  "payment_date" "defaulted"
# Load sales dataset
sales <- read.csv("/cloud/project/solar/sales.csv")
colnames(sales)
## [1] "sale_id"          "client_id"        "product_type"     "product_category"
## [5] "pay_model"        "sale_date"
# Load dplyr and ggpplot
library(dplyr)
library(ggplot2)

# What is the client breakdown by gender, age and location? 
demographic <- clients %>%
  group_by(gender,location)%>%
  summarise(avg_age = mean(age,na.rm=TRUE),.groups='drop')
print (demographic)
## # A tibble: 18 × 3
##    gender location       avg_age
##    <chr>  <chr>            <dbl>
##  1 Female "  Rural  "       45  
##  2 Female "  Suburban  "    39  
##  3 Female "  Urban  "       31  
##  4 Female "Rural"           41.2
##  5 Female "Suburban"        42.6
##  6 Female "Urban"           42.3
##  7 Male   "  Rural  "       39.8
##  8 Male   "  Suburban  "    30.5
##  9 Male   "  Urban  "       31.2
## 10 Male   "Rural"           38.2
## 11 Male   "Suburban"        34.5
## 12 Male   "Urban"           32.4
## 13 Other  "  Rural  "       35.5
## 14 Other  "  Suburban  "    38.2
## 15 Other  "  Urban  "       37  
## 16 Other  "Rural"           34.8
## 17 Other  "Suburban"        49  
## 18 Other  "Urban"           40.8
# Visualize output
ggplot(data=demographic, mapping=aes(x=location,y=avg_age,fill=gender))+
  geom_col(color='black')

# Most common service types in the after-sales records?
common_service <- after_sales %>%
  group_by(service_type)%>%
  summarise(service_count = n())%>%
  arrange(desc(service_count))
print(common_service)
## # A tibble: 6 × 2
##   service_type       service_count
##   <chr>                      <int>
## 1 "Installation"                34
## 2 "Maintenance"                 24
## 3 "Repair"                      19
## 4 "  Maintenance  "              8
## 5 "  Repair  "                   8
## 6 "  Installation  "             7
# Visualize output
ggplot(data=common_service,mapping=aes(x=service_type,y=service_count))+
  geom_bar(stat='identity', color='blue') +
  labs(title='Most Common Service Types') + theme_minimal()

# Total number of sales and after-sales services completed

# join payment, sales and after sales tables
sales_payments <- left_join(sales,payments,by='sale_id')

# join sales_payments table to after sales table
sales_payments_aftersales <- left_join(sales_payments,after_sales,by='sale_id')
## Warning in left_join(sales_payments, after_sales, by = "sale_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 14 of `x` matches multiple rows in `y`.
## ℹ Row 71 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# establish total sales and after sales services
total <- sales_payments_aftersales %>%
  group_by (product_type) %>%
  summarise(total_amount = sum(amount_paid,na.rm=TRUE),
            service_count = n()
            ,.groups='drop')%>%
  arrange(desc(total_amount))
print(total)
## # A tibble: 6 × 3
##   product_type            total_amount service_count
##   <chr>                          <dbl>         <int>
## 1 "Solar Home System"           21474.            76
## 2 "Solar Water Pump"            14511.            47
## 3 "Solar Lantern"               14143.            47
## 4 "  Solar Lantern  "           12475.            35
## 5 "  Solar Water Pump  "         8211.            24
## 6 "  Solar Home System  "        2253.             9
# visualize output
ggplot(data=total,mapping=aes(x=product_type,y=total_amount,fill=service_count))+
  geom_col() + labs(title='Total Sales & After sales Count') +
  theme(axis.text.x = element_text(angle=45,hjust=1))

# Are there any sales that have no associated payments? 
# If so, how many?

null_sales<-sales_payments %>%
  filter(is.na(amount_paid)) %>%
  group_by(sale_id)%>%
  summarise(sales_count=n())%>%
   arrange(desc(sales_count))%>%
  slice(1:5)
print(null_sales)
## # A tibble: 5 × 2
##   sale_id sales_count
##     <int>       <int>
## 1      32           2
## 2     110           2
## 3       1           1
## 4       4           1
## 5       5           1
# Which pay model is most commonly used across sales transactions?
top_model<- sales_payments %>%
  group_by(pay_model) %>%
  summarise(model_count=n())%>%
  arrange(desc(model_count))
print(top_model)
## # A tibble: 2 × 2
##   pay_model   model_count
##   <chr>             <int>
## 1 Direct Cash         112
## 2 PAYGO                89
# visualize output
ggplot(data=top_model, mapping=aes(x=pay_model, y=model_count))+
  geom_col(color='black', fill='lightblue') +
  labs(title='Pay Model Usage')

# What is the average time between a sale and its payment date?
# Does this vary significantly by product type or pay model?

# step1: ensure the dates are in the correct format
sales_payments <- sales_payments %>%
  mutate(sale_date=as.Date(sale_date),payment_date=as.Date(payment_date))

# step2:calculate time difference between sales date and payment date
sales_payments <- sales_payments %>%
  mutate(time_diff=as.numeric(difftime(payment_date,sale_date,units='days')))

# step3: calculate average time difference by product
avg_time_product <- sales_payments %>%
  group_by (product_type)%>%
  summarise(avg_time = mean(time_diff,na.rm=TRUE),.groups='drop')
print(avg_time_product)
## # A tibble: 6 × 2
##   product_type            avg_time
##   <chr>                      <dbl>
## 1 "  Solar Home System  "   -57.3 
## 2 "  Solar Lantern  "         8.27
## 3 "  Solar Water Pump  "     12.1 
## 4 "Solar Home System"       -21.6 
## 5 "Solar Lantern"            32.6 
## 6 "Solar Water Pump"         -6.70
# visualize output above
ggplot(data=avg_time_product, mapping=aes(x=product_type,y=avg_time)) + 
  geom_col(fill='black')+theme(axis.text.x = element_text(angle=45,hjust=1))+
  labs(title='Average Time difference by Product')

# step4: calculate average time difference by pay model
avg_time_paymodel <- sales_payments %>%
  group_by (pay_model)%>%
  summarise(avg_time = mean(time_diff,na.rm=TRUE),.groups='drop')
print(avg_time_paymodel)
## # A tibble: 2 × 2
##   pay_model   avg_time
##   <chr>          <dbl>
## 1 Direct Cash     5.91
## 2 PAYGO          -8.61
# visualize output above
ggplot(data=avg_time_paymodel, mapping=aes(x=pay_model,y=avg_time)) + 
  geom_col(fill='blue')+theme(axis.text.x = element_text(angle=45,hjust=1))+
  labs(title='Average Time difference by PayModel')

# What is the average amount paid and how does it vary across client income levels?

# step1: create income categories
income_group <- cut(clients$income,
                    breaks=c(5000,10000,20000,30000,40000,50000),
                    labels=c("5000-10000","10001-20000","20001-30000","30001-40000","40001-50000"),
                    include.lowest='TRUE')

# step2: create new income category column in client table
clients <- clients %>%
  mutate(Income_Group = income_group)

# step4: join the salespayment table to the client tables
sales_payments_clients <- left_join(sales_payments,clients,by='client_id')

#step5: calculate the average amount paid
average_amount <- sales_payments_clients %>%
  group_by (Income_Group)%>%
  summarise(avg_amount_paid=mean(amount_paid,na.rm=TRUE),.groups='drop')
print(average_amount)
## # A tibble: 6 × 2
##   Income_Group avg_amount_paid
##   <fct>                  <dbl>
## 1 5000-10000              437.
## 2 10001-20000             483.
## 3 20001-30000             479.
## 4 30001-40000             594.
## 5 40001-50000             532.
## 6 <NA>                    532.
# visualize output above
ggplot(data=average_amount,mapping=aes(x=Income_Group,y=avg_amount_paid,fill=Income_Group))+
  geom_col(color='black')+theme(axis.text.x = element_text(angle=45,hjust=1))+
  labs(title='Average amount Vs Income')

# What is the distribution of service types provided in after-sales support?
ggplot(data=after_sales,mapping=aes(x=service_type))+
  geom_bar(binwidth=5, fill='lightblue',color='black')+
  labs(title='Distribution of Service Types')
## Warning in geom_bar(binwidth = 5, fill = "lightblue", color = "black"):
## Ignoring unknown parameters: `binwidth`

# Is there a relationship between product type and the type of after-sales service requested?

# step1: join the sales and after sales table
  sales_after_sales <- left_join(sales,after_sales,by='sale_id')
  
# step2: establish relationship
  relationship <- sales_after_sales %>%
    filter(!is.na(product_type) & !is.na(service_type))%>%
    group_by(product_type,service_type)%>%
    summarise(service_count=n(),.groups='drop')
   
  print(relationship) 
## # A tibble: 30 × 3
##    product_type            service_type       service_count
##    <chr>                   <chr>                      <int>
##  1 "  Solar Home System  " "  Maintenance  "              1
##  2 "  Solar Home System  " "Installation"                 2
##  3 "  Solar Home System  " "Repair"                       1
##  4 "  Solar Lantern  "     "  Installation  "             2
##  5 "  Solar Lantern  "     "  Maintenance  "              3
##  6 "  Solar Lantern  "     "Installation"                 8
##  7 "  Solar Lantern  "     "Maintenance"                  2
##  8 "  Solar Lantern  "     "Repair"                       1
##  9 "  Solar Water Pump  "  "  Maintenance  "              1
## 10 "  Solar Water Pump  "  "Installation"                 2
## # ℹ 20 more rows
#step3:visualize the results above
  ggplot(relationship, aes(x = product_type, y = service_type, fill = service_count)) +
    geom_tile(color = 'white') +
    scale_fill_gradient(low = "lightblue", high = "blue") +
    labs(title = 'Product Types vs After-Sales Service Types') +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

# What is the gender distribution of clients across different payment models?

# step1: create a client sales table
client_sales <- left_join(clients,sales,by='client_id')

# step2: calculate the client count across payment models
gender_distribution <- client_sales %>%
  filter(!is.na(pay_model))%>%
  group_by(pay_model, gender)%>%
  summarise(gender_count=n(),.groups='drop')
print(gender_distribution)
## # A tibble: 6 × 3
##   pay_model   gender gender_count
##   <chr>       <chr>         <int>
## 1 Direct Cash Female           30
## 2 Direct Cash Male             29
## 3 Direct Cash Other            23
## 4 PAYGO       Female           16
## 5 PAYGO       Male             21
## 6 PAYGO       Other            31
# step2: visualize the distribution
ggplot(gender_distribution,aes(x=pay_model,y=gender_count,fill=gender))+
  geom_col(color='black')+labs(title='Gender Distribution by Payment Model')

# What are the peak months or seasons for payments?

# step1:calculate the peak sales period
peak_payments <- payments %>%
  filter(!is.na(payment_date))%>%
  group_by (payment_date)%>%
  summarise(total_amount=sum(amount_paid),.group='drop')%>%
  arrange(desc(total_amount))%>%
  slice(1:5)
print(peak_payments)
## # A tibble: 5 × 3
##   payment_date total_amount .group
##   <chr>               <dbl> <chr> 
## 1 2022-02-27           978. drop  
## 2 2022-03-12           965. drop  
## 3 2022-05-12           953  drop  
## 4 2022-02-12           941. drop  
## 5 2022-03-31           908. drop
# how does product category usage vary by marital status

#re-load tidyverse and dplyr
library(tidyverse)
library(dplyr)

#step1: join the sales table and clients table
sales_clients <- left_join(sales,clients,by='client_id')

#step2: aggregate product category and group by marital status
category_usage <- sales_clients %>%
  group_by(marital_status,product_category)%>%
  summarise(category_count=n(),.groups='drop')%>%
  mutate(percentage_category=category_count/sum(category_count)*100)
print(category_usage)
## # A tibble: 29 × 4
##    marital_status product_category     category_count percentage_category
##    <chr>          <chr>                         <int>               <dbl>
##  1 "  Divorced  " "  Home Use  "                    1               0.667
##  2 "  Divorced  " "  Productive Use  "              1               0.667
##  3 "  Divorced  " "Home Use"                        3               2    
##  4 "  Married  "  "Home Use"                        1               0.667
##  5 "  Married  "  "Productive Use"                  2               1.33 
##  6 "  Single  "   "  Home Use  "                    3               2    
##  7 "  Single  "   "  Productive Use  "              2               1.33 
##  8 "  Single  "   "Home Use"                        7               4.67 
##  9 "  Single  "   "Productive Use"                  9               6    
## 10 "  Widowed  "  "  Home Use  "                    4               2.67 
## # ℹ 19 more rows
# visualize output
ggplot(category_usage,aes(x=product_category,y=marital_status,fill=category_count))+
  geom_tile(color='white')+
  scale_fill_gradient(low='lightgreen',high='darkgreen')+
  labs(title='Category Usage by Marital Status')+
  theme(axis.text.x = element_text(angle=45,hjust=1))

# How do default rates vary by the different pay models?
default_rates <-sales_payments %>%
  filter(!is.na(defaulted))%>%
  group_by(pay_model,defaulted)%>%
  summarise(default_count=n(),.groups='drop')%>%
  mutate(default_rate=default_count/sum(default_count)*100)
print(default_rates)
## # A tibble: 4 × 4
##   pay_model   defaulted default_count default_rate
##   <chr>       <chr>             <int>        <dbl>
## 1 Direct Cash No                   41         27.3
## 2 Direct Cash Yes                  39         26  
## 3 PAYGO       No                   26         17.3
## 4 PAYGO       Yes                  44         29.3
# visualize output
ggplot(default_rates,aes(x=pay_model,y=default_rate,fill=defaulted))+
  geom_col() + labs(title='Default rate by Pay Model')