- Unsupervised learning to find hourly utilization using K-means Clustering
- Predicting GPRS data usage as a function of sms and calls using different models
- Prediction model performance validation
The dataset is the result of a computation over the Call Detail Records (CDRs) generated by the Telecom Italia cellular network over the city of Milano. CDRs log the user activity for billing purposes and network management. There are many types of CDRs, for the generation of this dataset we considered those related to the following activities:
https://dandelion.eu/datagems/SpazioDati/telecom-sms-call-internet-mi/resource/
https://dandelion.eu/datagems/SpazioDati/milano-grid/description/
library("ggplot2")
#install.packages("reshape")
library("reshape")
## Warning: package 'reshape' was built under R version 3.4.3
#set working directory
setwd("C:/Users/Manura/OneDrive - Axiata Digital/AM021/Documents/Personal/EDU/UM/WQD Study Material/WQD7004 - Programming for Data Science")
# read Italia cellular network over the city of Milano
cdr_input <- read.csv('sms-call-internet-mi-2013-12-01.txt',sep="\t",header=F)
#checking the head and tail of the dataset
head(cdr_input)
tail(cdr_input)
# check the number of observations
dim(cdr_input)
## [1] 4438331 8
nrow(cdr_input)
## [1] 4438331
# check the structure of the dataset
str(cdr_input)
## 'data.frame': 4438331 obs. of 8 variables:
## $ V1: int 1 1 1 1 1 1 1 1 1 1 ...
## $ V2: num 1.39e+12 1.39e+12 1.39e+12 1.39e+12 1.39e+12 ...
## $ V3: int 39 46 39 0 39 39 39 46 39 0 ...
## $ V4: num 0.111 NA 0.1651 0.0291 0.1865 ...
## $ V5: num 0.1662 NA 0.1764 0.0273 0.1366 ...
## $ V6: num 0.1092 NA 0.0309 NA 0.0546 ...
## $ V7: num 0.1644 NA 0.0273 NA NA ...
## $ V8: num 13.6484 0.0261 13.3309 NA 11.3296 ...
# Statistical Summary of data
summary(cdr_input)
## V1 V2 V3 V4
## Min. : 1 Min. :1.386e+12 Min. : 0.0 Min. : 0.0
## 1st Qu.: 2977 1st Qu.:1.386e+12 1st Qu.: 1.0 1st Qu.: 0.1
## Median : 5259 Median :1.386e+12 Median : 39.0 Median : 0.5
## Mean : 5161 Mean :1.386e+12 Mean : 197.3 Mean : 1.7
## 3rd Qu.: 7421 3rd Qu.:1.386e+12 3rd Qu.: 44.0 3rd Qu.: 1.6
## Max. :10000 Max. :1.386e+12 Max. :97259.0 Max. :277.4
## NA's :1806830
## V5 V6 V7 V8
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.1 1st Qu.: 0.1 1st Qu.: 0.1 1st Qu.: 0.2
## Median : 0.5 Median : 0.5 Median : 0.3 Median : 9.6
## Mean : 1.7 Mean : 1.7 Mean : 1.4 Mean : 36.6
## 3rd Qu.: 1.6 3rd Qu.: 1.7 3rd Qu.: 1.1 3rd Qu.: 36.3
## Max. :204.9 Max. :244.3 Max. :239.0 Max. :6946.2
## NA's :2904395 NA's :3017607 NA's :2355041 NA's :2239881
# rename the dataframe columns
colnames(cdr_input) <- c("square_id","time_interval","country_code","sms_in_activity","sms_out_activity","call_in_activity","call_out_activity","internet_traffic_activity")
# subset the input dataframe for the 500 square_ids
cdr_input_subset_df <- subset(cdr_input,cdr_input$square_id<=500)
head(cdr_input_subset_df)
dim(cdr_input_subset_df)
## [1] 185107 8
str(cdr_input_subset_df)
## 'data.frame': 185107 obs. of 8 variables:
## $ square_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ time_interval : num 1.39e+12 1.39e+12 1.39e+12 1.39e+12 1.39e+12 ...
## $ country_code : int 39 46 39 0 39 39 39 46 39 0 ...
## $ sms_in_activity : num 0.111 NA 0.1651 0.0291 0.1865 ...
## $ sms_out_activity : num 0.1662 NA 0.1764 0.0273 0.1366 ...
## $ call_in_activity : num 0.1092 NA 0.0309 NA 0.0546 ...
## $ call_out_activity : num 0.1644 NA 0.0273 NA NA ...
## $ internet_traffic_activity: num 13.6484 0.0261 13.3309 NA 11.3296 ...
summary(cdr_input_subset_df)
## square_id time_interval country_code sms_in_activity
## Min. : 1.0 Min. :1.386e+12 Min. : 0 Min. : 0.00
## 1st Qu.:127.0 1st Qu.:1.386e+12 1st Qu.: 0 1st Qu.: 0.06
## Median :251.0 Median :1.386e+12 Median : 39 Median : 0.15
## Mean :251.3 Mean :1.386e+12 Mean : 243 Mean : 0.29
## 3rd Qu.:375.0 3rd Qu.:1.386e+12 3rd Qu.: 41 3rd Qu.: 0.36
## Max. :500.0 Max. :1.386e+12 Max. :88239 Max. :23.82
## NA's :63620
## sms_out_activity call_in_activity call_out_activity
## Min. : 0.00 Min. :0.00 Min. :0.00
## 1st Qu.: 0.05 1st Qu.:0.04 1st Qu.:0.03
## Median : 0.15 Median :0.15 Median :0.11
## Mean : 0.26 Mean :0.27 Mean :0.25
## 3rd Qu.: 0.32 3rd Qu.:0.36 3rd Qu.:0.32
## Max. :11.16 Max. :7.05 Max. :7.94
## NA's :113829 NA's :119425 NA's :104950
## internet_traffic_activity
## Min. : 0.00
## 1st Qu.: 0.07
## Median : 5.34
## Mean : 8.95
## 3rd Qu.: 11.91
## Max. :341.95
## NA's :83074
#creating backup
cdr_input_subset_df_bkp <- cdr_input_subset_df
#cdr_input_subset_df <- cdr_input_subset_df_bkp
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:reshape':
##
## rename
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#cdr_input_subset_df_1 <- cdr_input_subset_df%>%group_by(country_code)%>%mutate(funs(replace(.,which(is.na(.)),mean(., na.rm=TRUE))))
#summary(cdr_input_subset_df_1)
cdr_input_subset_df_cln <- cdr_input_subset_df%>% group_by(country_code) %>%mutate(sms_in_activity = ifelse(is.na(sms_in_activity),as.integer(mean(sms_in_activity, na.rm = TRUE)), sms_in_activity))
cdr_input_subset_df_cln <- cdr_input_subset_df_cln %>% mutate(sms_in_activity = ifelse(is.na(sms_in_activity),0, sms_in_activity))
cdr_input_subset_df_cln <- cdr_input_subset_df_cln%>% group_by(country_code) %>%mutate(sms_out_activity = ifelse(is.na(sms_out_activity),as.integer(mean(sms_out_activity, na.rm = TRUE)), sms_out_activity))
cdr_input_subset_df_cln <- cdr_input_subset_df_cln %>% mutate(sms_out_activity = ifelse(is.na(sms_out_activity),0, sms_out_activity))
cdr_input_subset_df_cln <- cdr_input_subset_df_cln%>% group_by(country_code) %>%mutate(call_in_activity = ifelse(is.na(call_in_activity),as.integer(mean(call_in_activity, na.rm = TRUE)), call_in_activity))
cdr_input_subset_df_cln <- cdr_input_subset_df_cln %>% mutate(call_in_activity = ifelse(is.na(call_in_activity),0, call_in_activity))
cdr_input_subset_df_cln <- cdr_input_subset_df_cln%>% group_by(country_code) %>%mutate(call_out_activity = ifelse(is.na(call_out_activity),as.integer(mean(call_out_activity, na.rm = TRUE)), call_out_activity))
cdr_input_subset_df_cln <- cdr_input_subset_df_cln %>% mutate(call_out_activity = ifelse(is.na(call_out_activity),0, call_out_activity))
summary(cdr_input_subset_df_cln)
## square_id time_interval country_code sms_in_activity
## Min. : 1.0 Min. :1.386e+12 Min. : 0 Min. : 0.0000
## 1st Qu.:127.0 1st Qu.:1.386e+12 1st Qu.: 0 1st Qu.: 0.0000
## Median :251.0 Median :1.386e+12 Median : 39 Median : 0.0532
## Mean :251.3 Mean :1.386e+12 Mean : 243 Mean : 0.1870
## 3rd Qu.:375.0 3rd Qu.:1.386e+12 3rd Qu.: 41 3rd Qu.: 0.2292
## Max. :500.0 Max. :1.386e+12 Max. :88239 Max. :23.8215
##
## sms_out_activity call_in_activity call_out_activity
## Min. : 0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.: 0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median : 0.00000 Median :0.00000 Median :0.00000
## Mean : 0.09989 Mean :0.09713 Mean :0.10755
## 3rd Qu.: 0.08232 3rd Qu.:0.06382 3rd Qu.:0.07679
## Max. :11.15905 Max. :7.04975 Max. :7.93986
##
## internet_traffic_activity
## Min. : 0.00
## 1st Qu.: 0.07
## Median : 5.34
## Mean : 8.95
## 3rd Qu.: 11.91
## Max. :341.95
## NA's :83074
summary(cdr_input_subset_df)
## square_id time_interval country_code sms_in_activity
## Min. : 1.0 Min. :1.386e+12 Min. : 0 Min. : 0.00
## 1st Qu.:127.0 1st Qu.:1.386e+12 1st Qu.: 0 1st Qu.: 0.06
## Median :251.0 Median :1.386e+12 Median : 39 Median : 0.15
## Mean :251.3 Mean :1.386e+12 Mean : 243 Mean : 0.29
## 3rd Qu.:375.0 3rd Qu.:1.386e+12 3rd Qu.: 41 3rd Qu.: 0.36
## Max. :500.0 Max. :1.386e+12 Max. :88239 Max. :23.82
## NA's :63620
## sms_out_activity call_in_activity call_out_activity
## Min. : 0.00 Min. :0.00 Min. :0.00
## 1st Qu.: 0.05 1st Qu.:0.04 1st Qu.:0.03
## Median : 0.15 Median :0.15 Median :0.11
## Mean : 0.26 Mean :0.27 Mean :0.25
## 3rd Qu.: 0.32 3rd Qu.:0.36 3rd Qu.:0.32
## Max. :11.16 Max. :7.05 Max. :7.94
## NA's :113829 NA's :119425 NA's :104950
## internet_traffic_activity
## Min. : 0.00
## 1st Qu.: 0.07
## Median : 5.34
## Mean : 8.95
## 3rd Qu.: 11.91
## Max. :341.95
## NA's :83074
factorColumns <- c("square_id","country_code")
cdr_input_subset_df_cln[factorColumns] <- lapply(cdr_input_subset_df_cln[factorColumns],as.factor)
head(cdr_input_subset_df_cln)
val <- cdr_input_subset_df_cln$time_interval/1000
cdr_input_subset_df_cln$outputTime <- as.POSIXct(val, origin="1970-01-01",tz="UTC")
head(cdr_input_subset_df_cln)
# derive activity start date and hour from time interval
cdr_input_subset_df_cln$activity_start_time <- cdr_input_subset_df_cln$outputTime
#cdr_input_subset_df$activity_date <- as.Date(as.POSIXct(cdr_input_subset_df$activity_start_time,origin="1970-01-01"))
cdr_input_subset_df_cln$activity_date <- as.Date(cdr_input_subset_df_cln$activity_start_time)
cdr_input_subset_df_cln$activity_time <- format(cdr_input_subset_df_cln$activity_start_time,"%H")
# derive total activity from sms in and out, call in and out and internet traffic activity
cdr_input_subset_df_cln$total_activity <- rowSums(cdr_input_subset_df_cln[, c(4,5,6,7,8)],na.rm=T)
totalActivityDF <- aggregate(total_activity ~ activity_time,cdr_input_subset_df_cln,FUN=sum)
totalActivityPlot <- ggplot(data=totalActivityDF, aes(x=activity_time, y=total_activity)) + geom_bar(stat="identity")+theme_bw()
print(totalActivityPlot)
totalGridActivityDF <- aggregate(total_activity ~ square_id,cdr_input_subset_df_cln,FUN=sum)
# sort the data frame by total acitivity descending
totalGridActivityDF <- totalGridActivityDF[order(-totalGridActivityDF$total_activity),]
totalGridActivitySubset <- head(totalGridActivityDF,25)
# top 100 grids by total acitivity
totalGridActivityPlot <- ggplot(data=totalGridActivitySubset, aes(x=reorder(square_id,-total_activity), y=total_activity)) + geom_bar(stat="identity")+theme_bw()
print(totalGridActivityPlot)
# dummy count variable
cdr_input_subset_df_cln$count <- 1
# Sum the total number of acitivity by country code
totalActivityCountryDF <- aggregate(count ~ country_code,cdr_input_subset_df_cln,FUN=sum)
# sort the data frame by
totalActivityCountryDF <- totalActivityCountryDF[order(-totalActivityCountryDF$count),]
totalActivityCountryPlot <- ggplot(data=head(totalActivityCountryDF,20), aes(x=factor(1), fill=country_code)) + geom_bar(width=1)+coord_polar("y")
print(totalActivityCountryPlot)
# subset the required columns
cdrActivityDF <- subset(cdr_input_subset_df_cln,select=c("activity_time","total_activity"))
# find the vector of sum of squared error(SSE) for each cluster selection
wcss <- vector()
for(i in 1:20) wcss[i] = sum(kmeans(cdrActivityDF,i)$withinss)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
# form the dataframe with cluster and its SSE
clusterFrame <- data.frame(withinss=wcss,Cluster=seq(1:20))
# plot the result and selects the optimal cluster number by looking at the graph. When number of cluster increases then the SSE will be reduced.
ggplot(data=clusterFrame, aes(x=Cluster, y=withinss, group=1)) + geom_line(colour="red", size=1.5) + geom_point(colour="red", size=4, shape=21, fill="white")+theme_bw()
# subset the required columns
cdrActivityDF <- subset(cdrActivityDF,select=c("activity_time","total_activity"))
# Applying k-means on acitivity hours and total_activity
cdrClusterModel <- kmeans(cdrActivityDF,10,nstart=10)
## Warning: did not converge in 10 iterations
# print the summary of the model
print(summary(cdrClusterModel))
## Length Class Mode
## cluster 185107 -none- numeric
## centers 20 -none- numeric
## totss 1 -none- numeric
## withinss 10 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 10 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
# Append the identified cluster to the input dataframe
cdrActivityDF$cluster <- as.factor(cdrClusterModel$cluster)
cdrActivityDF$activity_time <- as.factor(cdrActivityDF$activity_time)
# melt the dataframe
cdrActivityDF.melt <- melt(cdrActivityDF)
## Using activity_time, cluster as id variables
# heat map plot for the clustering results
cdrActivityClusterPlot <- ggplot(cdrActivityDF.melt, aes(activity_time,cluster)) + geom_tile(aes(fill = value))+ scale_fill_gradient(low = "#FBCAC5",high = "#D66889")+theme_bw()
print(cdrActivityClusterPlot)
library(dplyr)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:reshape':
##
## expand, smiths
## Using elbow method to find the optimum number of clusters
# subset the required columns
cdr_cluster <- subset(cdr_input_subset_df_cln,select=c("country_code",
"sms_in_activity",
"sms_out_activity",
"call_in_activity",
"call_out_activity",
"internet_traffic_activity"
))
## cleaning na
cdr_cluster <- cdr_cluster%>% drop_na(internet_traffic_activity)%>% drop_na(call_out_activity)%>% drop_na(sms_out_activity)
# Using the elbow method to find the optimal number of clusters
# install.packages('caTools')
# library(caTools)
set.seed(6)
wcss = vector()
for (i in 1:10) wcss[i] = sum(kmeans(cdr_cluster, i)$withinss)
# plot(1:10,
# wcss,
# type = 'b',
# main = paste('The Elbow Method'),
# xlab = 'Number of clusters',
# ylab = 'WCSS')
# Fitting K-Means to the dataset
set.seed(29)
kmeans = kmeans(x = cdr_cluster, centers = 10)
y_kmeans = kmeans$cluster
# Visualising the clusters
library(cluster)
clusplot(cdr_cluster[c("country_code", "sms_in_activity")],
y_kmeans,
lines = 0,
shade = TRUE,
color = TRUE,
labels = 6,
plotchar = FALSE,
span = TRUE,
main = paste('Clusters of CDR Data'),
xlab = 'country_code',
ylab = 'sms_in_activity')
clusplot(cdr_cluster[c("call_in_activity", "internet_traffic_activity")],
y_kmeans,
lines = 0,
shade = TRUE,
color = TRUE,
labels = 6,
plotchar = FALSE,
span = TRUE,
main = paste('Clusters of CDR Data'),
xlab = 'call_in_activity',
ylab = 'internet_traffic_activity')
#importing caret package
library('caret')
## Loading required package: lattice
#importing tidyr package
library('tidyr')
#Creating subset for prediction from cdr_input_subset_df_cln
cdr_input_subset_df_cln_P <- subset(cdr_input_subset_df_cln,select=c(
#"activity_start_time",
"activity_time",
"sms_in_activity",
"sms_out_activity",
"call_in_activity",
"call_out_activity",
"internet_traffic_activity"))
# Splitting non NA values for prediction
cdr_input_subset_df_cln_P <- cdr_input_subset_df_cln_P%>% drop_na(internet_traffic_activity)%>% drop_na(call_out_activity)%>% drop_na(sms_out_activity)
#set seed for reproducibility
set.seed(42)
#model_1 using linear regression
# Fit lm model using 5 x 5-fold CV: model internet_traffic_activity as a function of all the varibales
model_1 <- train(internet_traffic_activity ~ .,
cdr_input_subset_df_cln_P,method = "lm",
trControl = trainControl(
method = "cv", number = 10,repeats = 5, verboseIter = TRUE
)
)
## Warning: `repeats` has no meaning for this resampling method.
## + Fold01: intercept=TRUE
## - Fold01: intercept=TRUE
## + Fold02: intercept=TRUE
## - Fold02: intercept=TRUE
## + Fold03: intercept=TRUE
## - Fold03: intercept=TRUE
## + Fold04: intercept=TRUE
## - Fold04: intercept=TRUE
## + Fold05: intercept=TRUE
## - Fold05: intercept=TRUE
## + Fold06: intercept=TRUE
## - Fold06: intercept=TRUE
## + Fold07: intercept=TRUE
## - Fold07: intercept=TRUE
## + Fold08: intercept=TRUE
## - Fold08: intercept=TRUE
## + Fold09: intercept=TRUE
## - Fold09: intercept=TRUE
## + Fold10: intercept=TRUE
## - Fold10: intercept=TRUE
## Aggregating results
## Fitting final model on full training set
#print model_1 results
print(model_1)
## Linear Regression
##
## 102033 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 91830, 91829, 91830, 91829, 91829, 91829, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 9.565461 0.6047386 4.340469
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Assign predicted values to a new varibale
cdr_input_subset_df_cln_P$internet_traffic_p1 <- predict(model_1,cdr_input_subset_df_cln_P)
library(ggplot2)
# ggplot(cdr_input_subset_df_cln_P, aes(x = activity_time)) +
# geom_line(aes(y = cdr_input_subset_df_cln_P$internet_traffic_activity), colour = "grey") +
# geom_line(aes(y = cdr_input_subset_df_cln_P$internet_traffic_p1), colour="blue") +
# ylab(label="Data Traffic") +
# xlab("Activity Time")
#
#
#
# ggplot(cdr_input_subset_df_cln_P, aes(x = activity_time)) +
# geom_point(aes(y = cdr_input_subset_df_cln_P$internet_traffic_activity), colour = "green") +
# geom_bar(aes(y = cdr_input_subset_df_cln_P$internet_traffic_p1), colour="blue", stat="identity")
fill1 <- "#4271AE"
line1 <- "#1F3552"
fill2 <- "gold1"
line2 <- "goldenrod2"
ggplot(cdr_input_subset_df_cln_P, aes(x = activity_time)) +
geom_boxplot(aes(y = cdr_input_subset_df_cln_P$internet_traffic_activity),
fill = fill1, colour = line1) +
geom_point(aes(y = cdr_input_subset_df_cln_P$internet_traffic_p1), colour = line2) +
ylab(label="Data Traffic") +
xlab("Activity Time")
#model_2 using Bayesian Generalized Linear Model
# Fit lm model using 5 x 5-fold CV: model internet_traffic_activity as a function of all the varibales
#install.packages('arm')
library(arm)
## Warning: package 'arm' was built under R version 3.4.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:reshape':
##
## expand
## Loading required package: lme4
## Warning: package 'lme4' was built under R version 3.4.3
##
## arm (Version 1.9-3, built: 2016-11-21)
## Working directory is C:/Users/Manura/OneDrive - Axiata Digital/AM021/Documents/Personal/EDU/UM/WQD Study Material/WQD7004 - Programming for Data Science
library(abind)
model_2 <- train(internet_traffic_activity ~ .,
cdr_input_subset_df_cln_P,method = 'bayesglm',
trControl = trainControl(
method = "cv", number = 10,repeats = 5, verboseIter = TRUE
)
)
## Warning: `repeats` has no meaning for this resampling method.
## + Fold01: parameter=none
## - Fold01: parameter=none
## + Fold02: parameter=none
## - Fold02: parameter=none
## + Fold03: parameter=none
## - Fold03: parameter=none
## + Fold04: parameter=none
## - Fold04: parameter=none
## + Fold05: parameter=none
## - Fold05: parameter=none
## + Fold06: parameter=none
## - Fold06: parameter=none
## + Fold07: parameter=none
## - Fold07: parameter=none
## + Fold08: parameter=none
## - Fold08: parameter=none
## + Fold09: parameter=none
## - Fold09: parameter=none
## + Fold10: parameter=none
## - Fold10: parameter=none
## Aggregating results
## Fitting final model on full training set
#print model_1 results
print(model_2)
## Bayesian Generalized Linear Model
##
## 102033 samples
## 6 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 91829, 91831, 91829, 91829, 91830, 91830, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 9.567901 0.6043231 4.340832
# Assign predicted values to a new varibale
cdr_input_subset_df_cln_P$internet_traffic_p2 <- predict(model_2,cdr_input_subset_df_cln_P)
#library(ggplot2)
fill1 <- "#4271AE"
line1 <- "#1F3552"
fill2 <- "gold1"
line2 <- "goldenrod2"
ggplot(cdr_input_subset_df_cln_P, aes(x = activity_time)) +
geom_boxplot(aes(y = cdr_input_subset_df_cln_P$internet_traffic_activity),
fill = fill1, colour = line1) +
geom_point(aes(y = cdr_input_subset_df_cln_P$internet_traffic_p2), colour = line2) +
ylab(label="Data Traffic") +
xlab("Activity Time")