Objectives : Call Detail Record Analysis - Predicting GPRS usage for Mobile Operators

            - Unsupervised learning to find hourly utilization using K-means Clustering 
            - Predicting GPRS data usage as a function of sms and calls using different models
            - Prediction model performance validation 

Description of Data

The dataset is the result of a computation over the Call Detail Records (CDRs) generated by the Telecom Italia cellular network over the city of Milano. CDRs log the user activity for billing purposes and network management. There are many types of CDRs, for the generation of this dataset we considered those related to the following activities:

  1. Square id: The id of the square that is part of the city GRID.
  2. Time interval: The beginning of the time interval expressed as the number of millisecond elapsed from the Unix Epoch on January 1st, 1970 at UTC. The end of the time interval can be obtained by adding 600000 milliseconds (10 minutes) to this value.
  3. Country code: The phone country code of a nation. Depending on the measured activity this value assumes different meanings that are explained later.
  4. SMS-in activity: The activity in terms of received SMS inside the Square id, during the Time interval and sent from the nation identified by the Country code.
  5. SMS-out activity: The activity in terms of sent SMS inside the Square id, during the Time interval and received by the nation identified by the Country code.
  6. Call-in activity: The activity in terms of received calls inside the Square id, during the Time interval and issued from the nation identified by the Country code.
  7. Call-out activity: The activity in terms of issued calls inside the Square id, during the Time interval and received by the nation identified by the Country code.
  8. Internet traffic activity: The activity in terms of performed internet traffic inside the Square id, during the Time interval and by the nation of the users performing the connection identified by the Country code.

https://dandelion.eu/datagems/SpazioDati/telecom-sms-call-internet-mi/resource/

https://dandelion.eu/datagems/SpazioDati/milano-grid/description/

library("ggplot2")
#install.packages("reshape")
library("reshape")
## Warning: package 'reshape' was built under R version 3.4.3
  #set working directory 
  setwd("C:/Users/Manura/OneDrive - Axiata Digital/AM021/Documents/Personal/EDU/UM/WQD Study Material/WQD7004  -  Programming for Data Science")
  
  

  # read Italia cellular network over the city of Milano
    cdr_input <- read.csv('sms-call-internet-mi-2013-12-01.txt',sep="\t",header=F)
    
    #checking the head and tail of the dataset 
    head(cdr_input)
    tail(cdr_input)
    # check the number of observations
    dim(cdr_input)
## [1] 4438331       8
    nrow(cdr_input)
## [1] 4438331
    # check the structure of the dataset
    str(cdr_input)
## 'data.frame':    4438331 obs. of  8 variables:
##  $ V1: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ V2: num  1.39e+12 1.39e+12 1.39e+12 1.39e+12 1.39e+12 ...
##  $ V3: int  39 46 39 0 39 39 39 46 39 0 ...
##  $ V4: num  0.111 NA 0.1651 0.0291 0.1865 ...
##  $ V5: num  0.1662 NA 0.1764 0.0273 0.1366 ...
##  $ V6: num  0.1092 NA 0.0309 NA 0.0546 ...
##  $ V7: num  0.1644 NA 0.0273 NA NA ...
##  $ V8: num  13.6484 0.0261 13.3309 NA 11.3296 ...
    # Statistical Summary of data 
    summary(cdr_input)
##        V1              V2                  V3                V4         
##  Min.   :    1   Min.   :1.386e+12   Min.   :    0.0   Min.   :  0.0    
##  1st Qu.: 2977   1st Qu.:1.386e+12   1st Qu.:    1.0   1st Qu.:  0.1    
##  Median : 5259   Median :1.386e+12   Median :   39.0   Median :  0.5    
##  Mean   : 5161   Mean   :1.386e+12   Mean   :  197.3   Mean   :  1.7    
##  3rd Qu.: 7421   3rd Qu.:1.386e+12   3rd Qu.:   44.0   3rd Qu.:  1.6    
##  Max.   :10000   Max.   :1.386e+12   Max.   :97259.0   Max.   :277.4    
##                                                        NA's   :1806830  
##        V5                V6                V7                V8         
##  Min.   :  0.0     Min.   :  0.0     Min.   :  0.0     Min.   :   0.0   
##  1st Qu.:  0.1     1st Qu.:  0.1     1st Qu.:  0.1     1st Qu.:   0.2   
##  Median :  0.5     Median :  0.5     Median :  0.3     Median :   9.6   
##  Mean   :  1.7     Mean   :  1.7     Mean   :  1.4     Mean   :  36.6   
##  3rd Qu.:  1.6     3rd Qu.:  1.7     3rd Qu.:  1.1     3rd Qu.:  36.3   
##  Max.   :204.9     Max.   :244.3     Max.   :239.0     Max.   :6946.2   
##  NA's   :2904395   NA's   :3017607   NA's   :2355041   NA's   :2239881
    # rename the dataframe columns
    colnames(cdr_input) <- c("square_id","time_interval","country_code","sms_in_activity","sms_out_activity","call_in_activity","call_out_activity","internet_traffic_activity")
        

    # subset the input dataframe for the 500 square_ids
    cdr_input_subset_df <- subset(cdr_input,cdr_input$square_id<=500)
    head(cdr_input_subset_df)
    dim(cdr_input_subset_df)
## [1] 185107      8
    str(cdr_input_subset_df)
## 'data.frame':    185107 obs. of  8 variables:
##  $ square_id                : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ time_interval            : num  1.39e+12 1.39e+12 1.39e+12 1.39e+12 1.39e+12 ...
##  $ country_code             : int  39 46 39 0 39 39 39 46 39 0 ...
##  $ sms_in_activity          : num  0.111 NA 0.1651 0.0291 0.1865 ...
##  $ sms_out_activity         : num  0.1662 NA 0.1764 0.0273 0.1366 ...
##  $ call_in_activity         : num  0.1092 NA 0.0309 NA 0.0546 ...
##  $ call_out_activity        : num  0.1644 NA 0.0273 NA NA ...
##  $ internet_traffic_activity: num  13.6484 0.0261 13.3309 NA 11.3296 ...
    summary(cdr_input_subset_df)
##    square_id     time_interval        country_code   sms_in_activity
##  Min.   :  1.0   Min.   :1.386e+12   Min.   :    0   Min.   : 0.00  
##  1st Qu.:127.0   1st Qu.:1.386e+12   1st Qu.:    0   1st Qu.: 0.06  
##  Median :251.0   Median :1.386e+12   Median :   39   Median : 0.15  
##  Mean   :251.3   Mean   :1.386e+12   Mean   :  243   Mean   : 0.29  
##  3rd Qu.:375.0   3rd Qu.:1.386e+12   3rd Qu.:   41   3rd Qu.: 0.36  
##  Max.   :500.0   Max.   :1.386e+12   Max.   :88239   Max.   :23.82  
##                                                      NA's   :63620  
##  sms_out_activity call_in_activity call_out_activity
##  Min.   : 0.00    Min.   :0.00     Min.   :0.00     
##  1st Qu.: 0.05    1st Qu.:0.04     1st Qu.:0.03     
##  Median : 0.15    Median :0.15     Median :0.11     
##  Mean   : 0.26    Mean   :0.27     Mean   :0.25     
##  3rd Qu.: 0.32    3rd Qu.:0.36     3rd Qu.:0.32     
##  Max.   :11.16    Max.   :7.05     Max.   :7.94     
##  NA's   :113829   NA's   :119425   NA's   :104950   
##  internet_traffic_activity
##  Min.   :  0.00           
##  1st Qu.:  0.07           
##  Median :  5.34           
##  Mean   :  8.95           
##  3rd Qu.: 11.91           
##  Max.   :341.95           
##  NA's   :83074

Treating missing values

#creating backup 
cdr_input_subset_df_bkp <- cdr_input_subset_df
#cdr_input_subset_df <- cdr_input_subset_df_bkp 


library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:reshape':
## 
##     rename
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#cdr_input_subset_df_1 <- cdr_input_subset_df%>%group_by(country_code)%>%mutate(funs(replace(.,which(is.na(.)),mean(., na.rm=TRUE))))
#summary(cdr_input_subset_df_1)

cdr_input_subset_df_cln <- cdr_input_subset_df%>% group_by(country_code) %>%mutate(sms_in_activity = ifelse(is.na(sms_in_activity),as.integer(mean(sms_in_activity, na.rm = TRUE)), sms_in_activity))

cdr_input_subset_df_cln  <- cdr_input_subset_df_cln %>% mutate(sms_in_activity = ifelse(is.na(sms_in_activity),0, sms_in_activity))

cdr_input_subset_df_cln <- cdr_input_subset_df_cln%>% group_by(country_code) %>%mutate(sms_out_activity = ifelse(is.na(sms_out_activity),as.integer(mean(sms_out_activity, na.rm = TRUE)), sms_out_activity))

cdr_input_subset_df_cln  <- cdr_input_subset_df_cln %>% mutate(sms_out_activity = ifelse(is.na(sms_out_activity),0, sms_out_activity))


cdr_input_subset_df_cln <- cdr_input_subset_df_cln%>% group_by(country_code) %>%mutate(call_in_activity = ifelse(is.na(call_in_activity),as.integer(mean(call_in_activity, na.rm = TRUE)), call_in_activity))

cdr_input_subset_df_cln  <- cdr_input_subset_df_cln %>% mutate(call_in_activity = ifelse(is.na(call_in_activity),0, call_in_activity))


cdr_input_subset_df_cln <- cdr_input_subset_df_cln%>% group_by(country_code) %>%mutate(call_out_activity = ifelse(is.na(call_out_activity),as.integer(mean(call_out_activity, na.rm = TRUE)), call_out_activity))

cdr_input_subset_df_cln  <- cdr_input_subset_df_cln %>% mutate(call_out_activity = ifelse(is.na(call_out_activity),0, call_out_activity))

summary(cdr_input_subset_df_cln)
##    square_id     time_interval        country_code   sms_in_activity  
##  Min.   :  1.0   Min.   :1.386e+12   Min.   :    0   Min.   : 0.0000  
##  1st Qu.:127.0   1st Qu.:1.386e+12   1st Qu.:    0   1st Qu.: 0.0000  
##  Median :251.0   Median :1.386e+12   Median :   39   Median : 0.0532  
##  Mean   :251.3   Mean   :1.386e+12   Mean   :  243   Mean   : 0.1870  
##  3rd Qu.:375.0   3rd Qu.:1.386e+12   3rd Qu.:   41   3rd Qu.: 0.2292  
##  Max.   :500.0   Max.   :1.386e+12   Max.   :88239   Max.   :23.8215  
##                                                                       
##  sms_out_activity   call_in_activity  call_out_activity
##  Min.   : 0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.: 0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median : 0.00000   Median :0.00000   Median :0.00000  
##  Mean   : 0.09989   Mean   :0.09713   Mean   :0.10755  
##  3rd Qu.: 0.08232   3rd Qu.:0.06382   3rd Qu.:0.07679  
##  Max.   :11.15905   Max.   :7.04975   Max.   :7.93986  
##                                                        
##  internet_traffic_activity
##  Min.   :  0.00           
##  1st Qu.:  0.07           
##  Median :  5.34           
##  Mean   :  8.95           
##  3rd Qu.: 11.91           
##  Max.   :341.95           
##  NA's   :83074
summary(cdr_input_subset_df)
##    square_id     time_interval        country_code   sms_in_activity
##  Min.   :  1.0   Min.   :1.386e+12   Min.   :    0   Min.   : 0.00  
##  1st Qu.:127.0   1st Qu.:1.386e+12   1st Qu.:    0   1st Qu.: 0.06  
##  Median :251.0   Median :1.386e+12   Median :   39   Median : 0.15  
##  Mean   :251.3   Mean   :1.386e+12   Mean   :  243   Mean   : 0.29  
##  3rd Qu.:375.0   3rd Qu.:1.386e+12   3rd Qu.:   41   3rd Qu.: 0.36  
##  Max.   :500.0   Max.   :1.386e+12   Max.   :88239   Max.   :23.82  
##                                                      NA's   :63620  
##  sms_out_activity call_in_activity call_out_activity
##  Min.   : 0.00    Min.   :0.00     Min.   :0.00     
##  1st Qu.: 0.05    1st Qu.:0.04     1st Qu.:0.03     
##  Median : 0.15    Median :0.15     Median :0.11     
##  Mean   : 0.26    Mean   :0.27     Mean   :0.25     
##  3rd Qu.: 0.32    3rd Qu.:0.36     3rd Qu.:0.32     
##  Max.   :11.16    Max.   :7.05     Max.   :7.94     
##  NA's   :113829   NA's   :119425   NA's   :104950   
##  internet_traffic_activity
##  Min.   :  0.00           
##  1st Qu.:  0.07           
##  Median :  5.34           
##  Mean   :  8.95           
##  3rd Qu.: 11.91           
##  Max.   :341.95           
##  NA's   :83074

convert numeric column into factor column

    factorColumns <- c("square_id","country_code")
    cdr_input_subset_df_cln[factorColumns] <- lapply(cdr_input_subset_df_cln[factorColumns],as.factor)
  head(cdr_input_subset_df_cln)

calculate the datetime from epoch unix time

    val <- cdr_input_subset_df_cln$time_interval/1000
    cdr_input_subset_df_cln$outputTime <- as.POSIXct(val, origin="1970-01-01",tz="UTC")
    head(cdr_input_subset_df_cln)

derive activity start date and hour from time interval

    # derive activity start date and hour from time interval
    cdr_input_subset_df_cln$activity_start_time <- cdr_input_subset_df_cln$outputTime 
    #cdr_input_subset_df$activity_date <- as.Date(as.POSIXct(cdr_input_subset_df$activity_start_time,origin="1970-01-01"))
  cdr_input_subset_df_cln$activity_date <- as.Date(cdr_input_subset_df_cln$activity_start_time)
    cdr_input_subset_df_cln$activity_time <- format(cdr_input_subset_df_cln$activity_start_time,"%H")
    
    # derive total activity from sms in and out, call in and out and internet traffic activity 
    cdr_input_subset_df_cln$total_activity <- rowSums(cdr_input_subset_df_cln[, c(4,5,6,7,8)],na.rm=T)

plotting the activity_time Vs total_activity

    totalActivityDF <- aggregate(total_activity ~ activity_time,cdr_input_subset_df_cln,FUN=sum)
    
    totalActivityPlot <- ggplot(data=totalActivityDF, aes(x=activity_time, y=total_activity)) + geom_bar(stat="identity")+theme_bw()
    
    print(totalActivityPlot)

plotting square_id Vs total_activity

    totalGridActivityDF <- aggregate(total_activity ~ square_id,cdr_input_subset_df_cln,FUN=sum)
    
    # sort the data frame by total acitivity descending
    totalGridActivityDF <- totalGridActivityDF[order(-totalGridActivityDF$total_activity),]
    
    totalGridActivitySubset <- head(totalGridActivityDF,25)
    
    # top 100 grids by total acitivity
    totalGridActivityPlot <- ggplot(data=totalGridActivitySubset, aes(x=reorder(square_id,-total_activity),   y=total_activity)) + geom_bar(stat="identity")+theme_bw()
    
    print(totalGridActivityPlot)

plotting country_code Vs total_activity

    # dummy count variable
    cdr_input_subset_df_cln$count <- 1

    # Sum the total number of acitivity by country code
    totalActivityCountryDF <- aggregate(count ~ country_code,cdr_input_subset_df_cln,FUN=sum)
    
    # sort the data frame by 
    totalActivityCountryDF <- totalActivityCountryDF[order(-totalActivityCountryDF$count),]
    
    totalActivityCountryPlot <- ggplot(data=head(totalActivityCountryDF,20), aes(x=factor(1), fill=country_code)) + geom_bar(width=1)+coord_polar("y")
    
    print(totalActivityCountryPlot)

Using elbow method to find the optimum number of clusters

    # subset the required columns
    cdrActivityDF <- subset(cdr_input_subset_df_cln,select=c("activity_time","total_activity"))
    
    # find the vector of sum of squared error(SSE) for each cluster selection
    wcss <- vector()
    for(i in 1:20) wcss[i] = sum(kmeans(cdrActivityDF,i)$withinss)
## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations
    # form the dataframe with cluster and its SSE
    clusterFrame <- data.frame(withinss=wcss,Cluster=seq(1:20))
    
    # plot the result and selects the optimal cluster number by looking at the graph. When number of cluster increases then the SSE will be reduced.
    ggplot(data=clusterFrame, aes(x=Cluster, y=withinss, group=1)) + geom_line(colour="red", size=1.5) + geom_point(colour="red", size=4, shape=21, fill="white")+theme_bw()

Applying K-Means Clustering for activity time and total activity

    # subset the required columns
    cdrActivityDF <- subset(cdrActivityDF,select=c("activity_time","total_activity"))
    
    # Applying k-means on acitivity hours and total_activity
    cdrClusterModel <- kmeans(cdrActivityDF,10,nstart=10)
## Warning: did not converge in 10 iterations
    # print the summary of the model
    print(summary(cdrClusterModel))
##              Length Class  Mode   
## cluster      185107 -none- numeric
## centers          20 -none- numeric
## totss             1 -none- numeric
## withinss         10 -none- numeric
## tot.withinss      1 -none- numeric
## betweenss         1 -none- numeric
## size             10 -none- numeric
## iter              1 -none- numeric
## ifault            1 -none- numeric
    # Append the identified cluster to the input dataframe
    cdrActivityDF$cluster <- as.factor(cdrClusterModel$cluster)
    cdrActivityDF$activity_time <- as.factor(cdrActivityDF$activity_time)
    
    # melt the dataframe
    cdrActivityDF.melt <- melt(cdrActivityDF)
## Using activity_time, cluster as id variables
    # heat map plot for the clustering results
    cdrActivityClusterPlot <- ggplot(cdrActivityDF.melt, aes(activity_time,cluster)) + geom_tile(aes(fill = value))+ scale_fill_gradient(low = "#FBCAC5",high = "#D66889")+theme_bw()
    print(cdrActivityClusterPlot)

Applying K-Means Clustering for other variables

library(dplyr)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:reshape':
## 
##     expand, smiths
## Using elbow method to find the optimum number of clusters 
                # subset the required columns
                cdr_cluster <- subset(cdr_input_subset_df_cln,select=c("country_code",
                                                                         "sms_in_activity",
                                                                         "sms_out_activity",
                                                                         "call_in_activity",
                                                                         "call_out_activity",
                                                                        "internet_traffic_activity"
                                                                         ))
## cleaning na 
  cdr_cluster  <- cdr_cluster%>% drop_na(internet_traffic_activity)%>% drop_na(call_out_activity)%>% drop_na(sms_out_activity)
  
# Using the elbow method to find the optimal number of clusters

# install.packages('caTools')
# library(caTools)
set.seed(6)
wcss = vector()
for (i in 1:10) wcss[i] = sum(kmeans(cdr_cluster, i)$withinss)
# plot(1:10,
#      wcss,
#      type = 'b',
#      main = paste('The Elbow Method'),
#      xlab = 'Number of clusters',
#      ylab = 'WCSS')

# Fitting K-Means to the dataset
set.seed(29)
kmeans = kmeans(x = cdr_cluster, centers = 10)
y_kmeans = kmeans$cluster

Visualising K-Means clusters

# Visualising the clusters
library(cluster)
clusplot(cdr_cluster[c("country_code", "sms_in_activity")],
         y_kmeans,
         lines = 0,
         shade = TRUE,
         color = TRUE,
         labels = 6,
         plotchar = FALSE,
         span = TRUE,
         main = paste('Clusters of CDR Data'),
         xlab = 'country_code',
         ylab = 'sms_in_activity')

clusplot(cdr_cluster[c("call_in_activity", "internet_traffic_activity")],
         y_kmeans,
         lines = 0,
         shade = TRUE,
         color = TRUE,
         labels = 6,
         plotchar = FALSE,
         span = TRUE,
         main = paste('Clusters of CDR Data'),
         xlab = 'call_in_activity',
         ylab = 'internet_traffic_activity')

Predicting internet_traffic using sms and call activities

  #importing caret package

  library('caret')
## Loading required package: lattice
  #importing tidyr package 
  
  library('tidyr')



  
  #Creating subset for prediction from cdr_input_subset_df_cln
  
  cdr_input_subset_df_cln_P <- subset(cdr_input_subset_df_cln,select=c(
    
#"activity_start_time", 
                                                                       "activity_time",
                                                                        "sms_in_activity",
                                                                       "sms_out_activity",
                                                                       "call_in_activity", 
                                                                       "call_out_activity",
                                                                       "internet_traffic_activity"))
  
  # Splitting non NA values for prediction 
  
  cdr_input_subset_df_cln_P <- cdr_input_subset_df_cln_P%>% drop_na(internet_traffic_activity)%>% drop_na(call_out_activity)%>% drop_na(sms_out_activity)

Linear Prediction model one model_1

  #set seed for reproducibility 

  set.seed(42)
  #model_1 using linear regression 
  # Fit lm model using 5 x 5-fold CV: model internet_traffic_activity as a function of all the varibales 
  model_1 <- train(internet_traffic_activity ~ ., 
                 cdr_input_subset_df_cln_P,method = "lm",
                 trControl = trainControl(
                   method = "cv", number = 10,repeats = 5, verboseIter = TRUE
  )
)
## Warning: `repeats` has no meaning for this resampling method.
## + Fold01: intercept=TRUE 
## - Fold01: intercept=TRUE 
## + Fold02: intercept=TRUE 
## - Fold02: intercept=TRUE 
## + Fold03: intercept=TRUE 
## - Fold03: intercept=TRUE 
## + Fold04: intercept=TRUE 
## - Fold04: intercept=TRUE 
## + Fold05: intercept=TRUE 
## - Fold05: intercept=TRUE 
## + Fold06: intercept=TRUE 
## - Fold06: intercept=TRUE 
## + Fold07: intercept=TRUE 
## - Fold07: intercept=TRUE 
## + Fold08: intercept=TRUE 
## - Fold08: intercept=TRUE 
## + Fold09: intercept=TRUE 
## - Fold09: intercept=TRUE 
## + Fold10: intercept=TRUE 
## - Fold10: intercept=TRUE 
## Aggregating results
## Fitting final model on full training set
  #print model_1 results 
  
  print(model_1)
## Linear Regression 
## 
## 102033 samples
##      5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 91830, 91829, 91830, 91829, 91829, 91829, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   9.565461  0.6047386  4.340469
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
  # Assign predicted values to a new varibale 
  
  cdr_input_subset_df_cln_P$internet_traffic_p1 <- predict(model_1,cdr_input_subset_df_cln_P)

Plotting actual values and predicted internet traffic

library(ggplot2)

# ggplot(cdr_input_subset_df_cln_P, aes(x = activity_time)) + 
#   geom_line(aes(y = cdr_input_subset_df_cln_P$internet_traffic_activity), colour = "grey") + 
#   geom_line(aes(y = cdr_input_subset_df_cln_P$internet_traffic_p1), colour="blue") + 
#   ylab(label="Data Traffic") + 
#   xlab("Activity Time")
# 
# 
# 
# ggplot(cdr_input_subset_df_cln_P, aes(x = activity_time)) + 
#   geom_point(aes(y = cdr_input_subset_df_cln_P$internet_traffic_activity), colour = "green") + 
#   geom_bar(aes(y = cdr_input_subset_df_cln_P$internet_traffic_p1), colour="blue", stat="identity")

fill1 <- "#4271AE"
line1 <- "#1F3552"
fill2 <- "gold1"
line2 <- "goldenrod2"

ggplot(cdr_input_subset_df_cln_P, aes(x = activity_time)) + 
  geom_boxplot(aes(y = cdr_input_subset_df_cln_P$internet_traffic_activity),
               fill = fill1, colour = line1) +
  geom_point(aes(y = cdr_input_subset_df_cln_P$internet_traffic_p1), colour = line2) +
      ylab(label="Data Traffic") + 
   xlab("Activity Time")

  #model_2 using Bayesian Generalized Linear Model
  # Fit lm model using 5 x 5-fold CV: model internet_traffic_activity as a function of all the varibales 
#install.packages('arm')  

library(arm)
## Warning: package 'arm' was built under R version 3.4.3
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:reshape':
## 
##     expand
## Loading required package: lme4
## Warning: package 'lme4' was built under R version 3.4.3
## 
## arm (Version 1.9-3, built: 2016-11-21)
## Working directory is C:/Users/Manura/OneDrive - Axiata Digital/AM021/Documents/Personal/EDU/UM/WQD Study Material/WQD7004  -  Programming for Data Science
library(abind)

model_2 <- train(internet_traffic_activity ~ ., 
                 cdr_input_subset_df_cln_P,method = 'bayesglm',
                 trControl = trainControl(
                   method = "cv", number = 10,repeats = 5, verboseIter = TRUE
  )
)
## Warning: `repeats` has no meaning for this resampling method.
## + Fold01: parameter=none 
## - Fold01: parameter=none 
## + Fold02: parameter=none 
## - Fold02: parameter=none 
## + Fold03: parameter=none 
## - Fold03: parameter=none 
## + Fold04: parameter=none 
## - Fold04: parameter=none 
## + Fold05: parameter=none 
## - Fold05: parameter=none 
## + Fold06: parameter=none 
## - Fold06: parameter=none 
## + Fold07: parameter=none 
## - Fold07: parameter=none 
## + Fold08: parameter=none 
## - Fold08: parameter=none 
## + Fold09: parameter=none 
## - Fold09: parameter=none 
## + Fold10: parameter=none 
## - Fold10: parameter=none 
## Aggregating results
## Fitting final model on full training set
  #print model_1 results 
  
  print(model_2)
## Bayesian Generalized Linear Model 
## 
## 102033 samples
##      6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 91829, 91831, 91829, 91829, 91830, 91830, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   9.567901  0.6043231  4.340832
  # Assign predicted values to a new varibale 
  
  cdr_input_subset_df_cln_P$internet_traffic_p2 <- predict(model_2,cdr_input_subset_df_cln_P)
#library(ggplot2)

fill1 <- "#4271AE"
line1 <- "#1F3552"
fill2 <- "gold1"
line2 <- "goldenrod2"

ggplot(cdr_input_subset_df_cln_P, aes(x = activity_time)) + 
  geom_boxplot(aes(y = cdr_input_subset_df_cln_P$internet_traffic_activity),
               fill = fill1, colour = line1) +
  geom_point(aes(y = cdr_input_subset_df_cln_P$internet_traffic_p2), colour = line2) +
      ylab(label="Data Traffic") + 
   xlab("Activity Time")