#uploading of dataset into r markdown file
df <- read.csv("data.csv", row.names = NULL)
print(df)
## Data.Series Total.Mobile.Phone.Subscriptions..At.End.Of.Period..Thousand.
## 1 2024 9963.5
## 2 2023 9589.6
## 3 2022 9152.4
## 4 2021 8348.5
## 5 2020 8231.2
## 6 2019 8985.1
## 7 2018 8354
## 8 2017 8452.8
## 9 2016 8398.8
## 10 2015 8211.4
## 11 2014 8093.3
## 12 2013 8420.7
## 13 2012 8063
## 14 2011 7755.2
## 15 2010 7288.6
## 16 2009 6857.2
## 17 2008 6340.2
## 18 2007 5619.2
## 19 2006 4637.7
## 20 2005 4256.8
## 21 2004 3860.6
## 22 2003 3477.1
## 23 2002 3244.8
## 24 2001 2858.8
## 25 2000 2442.1
## 26 1999 1471.3
## 27 1998 1020.2
## 28 1997 743
## 29 1996 404.1
## 30 1995 292
## 31 1994 222
## 32 1993 162
## 33 1992 110
## 34 1991 75
## 35 1990 46
## 36 1989 20
## 37 1988 na
## 38 1987 na
## 39 1986 na
## 40 1985 na
## Total.Post.Paid.Subscriptions..2G...At.End.Of.Period..Thousand.
## 1 na
## 2 na
## 3 na
## 4 na
## 5 na
## 6 na
## 7 na
## 8 na
## 9 30.9
## 10 49.2
## 11 89.8
## 12 117.8
## 13 152.7
## 14 204.5
## 15 289.8
## 16 456.2
## 17 791.6
## 18 1318.1
## 19 1960.1
## 20 2565.1
## 21 2613.8
## 22 na
## 23 na
## 24 na
## 25 na
## 26 na
## 27 na
## 28 na
## 29 na
## 30 na
## 31 na
## 32 na
## 33 na
## 34 na
## 35 na
## 36 na
## 37 na
## 38 na
## 39 na
## 40 na
## Total.Pre.Paid.Subscriptions..2G...At.End.Of.Period..Thousand.
## 1 na
## 2 na
## 3 na
## 4 na
## 5 na
## 6 na
## 7 na
## 8 na
## 9 101.4
## 10 172.4
## 11 213.1
## 12 993.7
## 13 1429.7
## 14 1795.3
## 15 2294.4
## 16 3240.7
## 17 3075
## 18 2600
## 19 1793.7
## 20 1516.9
## 21 1246.9
## 22 na
## 23 na
## 24 na
## 25 na
## 26 na
## 27 na
## 28 na
## 29 na
## 30 na
## 31 na
## 32 na
## 33 na
## 34 na
## 35 na
## 36 na
## 37 na
## 38 na
## 39 na
## 40 na
## Total.Subscriptions..3G...At.End.Of.Period..Thousand.
## 1 na
## 2 64.9
## 3 542.2
## 4 690.1
## 5 769.6
## 6 1651.1
## 7 1787.5
## 8 2349
## 9 3412.1
## 10 3913.5
## 11 4608.8
## 12 5258.3
## 13 6480.6
## 14 5755.4
## 15 4704.5
## 16 3160.3
## 17 2473.6
## 18 1701
## 19 884
## 20 174.8
## 21 na
## 22 na
## 23 na
## 24 na
## 25 na
## 26 na
## 27 na
## 28 na
## 29 na
## 30 na
## 31 na
## 32 na
## 33 na
## 34 na
## 35 na
## 36 na
## 37 na
## 38 na
## 39 na
## 40 na
## Total.Post.Paid.Subscriptions..3G...At.End.Of.Period..Thousand.
## 1 na
## 2 24
## 3 41.2
## 4 80.3
## 5 96.5
## 6 467
## 7 540.5
## 8 650
## 9 855.7
## 10 1163.4
## 11 1674.4
## 12 2393.9
## 13 4113.2
## 14 3825.3
## 15 3472.7
## 16 2989.4
## 17 na
## 18 na
## 19 na
## 20 na
## 21 na
## 22 na
## 23 na
## 24 na
## 25 na
## 26 na
## 27 na
## 28 na
## 29 na
## 30 na
## 31 na
## 32 na
## 33 na
## 34 na
## 35 na
## 36 na
## 37 na
## 38 na
## 39 na
## 40 na
## Total.Pre.Paid.Subscriptions..3G...At.End.Of.Period..Thousand.
## 1 na
## 2 40.9
## 3 501
## 4 609.8
## 5 673.1
## 6 1184.1
## 7 1247
## 8 1699
## 9 2556.4
## 10 2750.1
## 11 2934.4
## 12 2864.4
## 13 2367.4
## 14 1930.1
## 15 1231.8
## 16 170.9
## 17 na
## 18 na
## 19 na
## 20 na
## 21 na
## 22 na
## 23 na
## 24 na
## 25 na
## 26 na
## 27 na
## 28 na
## 29 na
## 30 na
## 31 na
## 32 na
## 33 na
## 34 na
## 35 na
## 36 na
## 37 na
## 38 na
## 39 na
## 40 na
## Total.Subscriptions..4G...At.End.Of.Period..Thousand.
## 1 7769.6
## 2 7650.6
## 3 7367.6
## 4 7658.4
## 5 7461.6
## 6 7334.1
## 7 6566.5
## 8 6103.9
## 9 4854.4
## 10 4076.3
## 11 3181.6
## 12 2050.8
## 13 na
## 14 na
## 15 na
## 16 na
## 17 na
## 18 na
## 19 na
## 20 na
## 21 na
## 22 na
## 23 na
## 24 na
## 25 na
## 26 na
## 27 na
## 28 na
## 29 na
## 30 na
## 31 na
## 32 na
## 33 na
## 34 na
## 35 na
## 36 na
## 37 na
## 38 na
## 39 na
## 40 na
## Total.Post.Paid.Subscriptions..4G...At.End.Of.Period..Thousand.
## 1 5545.5
## 2 5565.4
## 3 5627.7
## 4 6227.9
## 5 5914.2
## 6 5580.1
## 7 4812.2
## 8 4439.5
## 9 4084.3
## 10 3628.6
## 11 2924
## 12 na
## 13 na
## 14 na
## 15 na
## 16 na
## 17 na
## 18 na
## 19 na
## 20 na
## 21 na
## 22 na
## 23 na
## 24 na
## 25 na
## 26 na
## 27 na
## 28 na
## 29 na
## 30 na
## 31 na
## 32 na
## 33 na
## 34 na
## 35 na
## 36 na
## 37 na
## 38 na
## 39 na
## 40 na
## Total.Pre.Paid.Subscriptions..4G...At.End.Of.Period..Thousand.
## 1 2224.1
## 2 2085.2
## 3 1739.9
## 4 1430.5
## 5 1547.4
## 6 1754
## 7 1754.3
## 8 1664.4
## 9 770.1
## 10 447.7
## 11 257.6
## 12 na
## 13 na
## 14 na
## 15 na
## 16 na
## 17 na
## 18 na
## 19 na
## 20 na
## 21 na
## 22 na
## 23 na
## 24 na
## 25 na
## 26 na
## 27 na
## 28 na
## 29 na
## 30 na
## 31 na
## 32 na
## 33 na
## 34 na
## 35 na
## 36 na
## 37 na
## 38 na
## 39 na
## 40 na
## Total.Subscriptions..5G...At.End.Of.Period..Thousand.
## 1 2194
## 2 1874.1
## 3 1242.6
## 4 na
## 5 na
## 6 na
## 7 na
## 8 na
## 9 na
## 10 na
## 11 na
## 12 na
## 13 na
## 14 na
## 15 na
## 16 na
## 17 na
## 18 na
## 19 na
## 20 na
## 21 na
## 22 na
## 23 na
## 24 na
## 25 na
## 26 na
## 27 na
## 28 na
## 29 na
## 30 na
## 31 na
## 32 na
## 33 na
## 34 na
## 35 na
## 36 na
## 37 na
## 38 na
## 39 na
## 40 na
## Total.Post.Paid.Subscriptions..5G...At.End.Of.Period..Thousand.
## 1 1871.4
## 2 1590.8
## 3 1149.5
## 4 na
## 5 na
## 6 na
## 7 na
## 8 na
## 9 na
## 10 na
## 11 na
## 12 na
## 13 na
## 14 na
## 15 na
## 16 na
## 17 na
## 18 na
## 19 na
## 20 na
## 21 na
## 22 na
## 23 na
## 24 na
## 25 na
## 26 na
## 27 na
## 28 na
## 29 na
## 30 na
## 31 na
## 32 na
## 33 na
## 34 na
## 35 na
## 36 na
## 37 na
## 38 na
## 39 na
## 40 na
## Total.Pre.Paid.Subscriptions..5G...At.End.Of.Period..Thousand.
## 1 322.6
## 2 283.3
## 3 93.1
## 4 na
## 5 na
## 6 na
## 7 na
## 8 na
## 9 na
## 10 na
## 11 na
## 12 na
## 13 na
## 14 na
## 15 na
## 16 na
## 17 na
## 18 na
## 19 na
## 20 na
## 21 na
## 22 na
## 23 na
## 24 na
## 25 na
## 26 na
## 27 na
## 28 na
## 29 na
## 30 na
## 31 na
## 32 na
## 33 na
## 34 na
## 35 na
## 36 na
## 37 na
## 38 na
## 39 na
## 40 na
## Total.Fixed.Line.Subscriptions..At.End.Of.Period..Thousand.
## 1 2041.0
## 2 1912.4
## 3 1906.2
## 4 1889.8
## 5 1891.6
## 6 1905.8
## 7 1966.6
## 8 1983.1
## 9 2003.2
## 10 2017.3
## 11 1996.6
## 12 1970.8
## 13 1989.5
## 14 2016.9
## 15 1983.9
## 16 1896.1
## 17 1872.0
## 18 1859.3
## 19 1851.0
## 20 1847.8
## 21 1862.5
## 22 1896.6
## 23 1933.7
## 24 1948.5
## 25 1935.9
## 26 1850.7
## 27 1751.5
## 28 1656.0
## 29 1531.0
## 30 1400.0
## 31 1310.0
## 32 1227.0
## 33 1154.0
## 34 1086.0
## 35 1040.0
## 36 982.0
## 37 945.0
## 38 897.0
## 39 851.0
## 40 818.0
## X..Residential.Fixed.Line.Subscriptions..At.End.Of.Period..Thousand.
## 1 1163.7
## 2 1193.3
## 3 1210.3
## 4 1224.9
## 5 1252.7
## 6 1280.9
## 7 1312.3
## 8 1299.2
## 9 1295.9
## 10 1280.6
## 11 1247.6
## 12 1207.7
## 13 1214.1
## 14 1237.1
## 15 1203.1
## 16 1128.3
## 17 1095.7
## 18 1087.1
## 19 1092.4
## 20 1088.4
## 21 1099.9
## 22 na
## 23 na
## 24 na
## 25 na
## 26 na
## 27 na
## 28 na
## 29 na
## 30 na
## 31 na
## 32 na
## 33 na
## 34 na
## 35 na
## 36 na
## 37 na
## 38 na
## 39 na
## 40 na
## X..Corporate.Business.Fixed.Line.Subscriptions..At.End.Of.Period..Thousand.
## 1 877.3
## 2 719.1
## 3 695.9
## 4 664.9
## 5 638.9
## 6 624.9
## 7 654.3
## 8 683.9
## 9 707.3
## 10 736.8
## 11 749
## 12 763.1
## 13 775.4
## 14 779.7
## 15 780.8
## 16 767.8
## 17 776.4
## 18 772.2
## 19 758.6
## 20 759.4
## 21 762.6
## 22 na
## 23 na
## 24 na
## 25 na
## 26 na
## 27 na
## 28 na
## 29 na
## 30 na
## 31 na
## 32 na
## 33 na
## 34 na
## 35 na
## 36 na
## 37 na
## 38 na
## 39 na
## 40 na
## International.Telephone.Call.Minutes..Million.
## 1 4355.5
## 2 5679.9
## 3 8019.4
## 4 10294.2
## 5 13544.8
## 6 18544.7
## 7 22920.0
## 8 21242.0
## 9 23638.8
## 10 22018.2
## 11 18165.5
## 12 14489.6
## 13 13444.8
## 14 12367.4
## 15 12436.3
## 16 10169.0
## 17 8456.7
## 18 6302.5
## 19 5074.1
## 20 4777.9
## 21 4130.5
## 22 3074.0
## 23 2829.8
## 24 2561.4
## 25 1047.7
## 26 858.8
## 27 790.9
## 28 697.0
## 29 579.0
## 30 550.0
## 31 451.0
## 32 374.0
## 33 324.0
## 34 284.0
## 35 240.0
## 36 204.0
## 37 165.0
## 38 131.0
## 39 104.0
## 40 84.0
## Total.Broadband.Internet.Subscriptions..At.End.Of.Period..Thousand.
## 1 12735.9
## 2 12689.6
## 3 12565
## 4 11902.3
## 5 11170.2
## 6 12217.1
## 7 11820
## 8 13062.8
## 9 12543
## 10 11992.1
## 11 11537.2
## 12 10653.6
## 13 10194.9
## 14 9222.3
## 15 7849.3
## 16 5819
## 17 4717.5
## 18 3250.7
## 19 757.5
## 20 629.6
## 21 506.1
## 22 385.8
## 23 230.2
## 24 na
## 25 na
## 26 na
## 27 na
## 28 na
## 29 na
## 30 na
## 31 na
## 32 na
## 33 na
## 34 na
## 35 na
## 36 na
## 37 na
## 38 na
## 39 na
## 40 na
## X..Total.Wireless.Broadband.Subscriptions..At.End.Of.Period..Thousand.
## 1 11113.7
## 2 11108
## 3 11018
## 4 10382.4
## 5 9652.6
## 6 10719.6
## 7 10330.1
## 8 11592.5
## 9 11088
## 10 10518.8
## 11 10083.3
## 12 9260.1
## 13 8823.9
## 14 7898.9
## 15 6587.2
## 16 4717.4
## 17 na
## 18 na
## 19 na
## 20 na
## 21 na
## 22 na
## 23 na
## 24 na
## 25 na
## 26 na
## 27 na
## 28 na
## 29 na
## 30 na
## 31 na
## 32 na
## 33 na
## 34 na
## 35 na
## 36 na
## 37 na
## 38 na
## 39 na
## 40 na
## X..Residential.Wired.Broadband.Subscriptions..At.End.Of.Period..Thousand.
## 1 1484.9
## 2 1443.7
## 3 1415.3
## 4 1390.2
## 5 1392.2
## 6 1372.8
## 7 1366.4
## 8 1343.2
## 9 1336.2
## 10 1359.9
## 11 1347.5
## 12 1295.4
## 13 1277.6
## 14 1237.9
## 15 na
## 16 na
## 17 na
## 18 na
## 19 na
## 20 na
## 21 na
## 22 na
## 23 na
## 24 na
## 25 na
## 26 na
## 27 na
## 28 na
## 29 na
## 30 na
## 31 na
## 32 na
## 33 na
## 34 na
## 35 na
## 36 na
## 37 na
## 38 na
## 39 na
## 40 na
## X..Corporate.Wired.Broadband.Subscriptions..At.End.Of.Period..Thousand.
## 1 137.3
## 2 137.9
## 3 131.7
## 4 129.7
## 5 125.5
## 6 124.7
## 7 123.6
## 8 127.1
## 9 118.8
## 10 113.4
## 11 106.5
## 12 98
## 13 93.4
## 14 85.5
## 15 na
## 16 na
## 17 na
## 18 na
## 19 na
## 20 na
## 21 na
## 22 na
## 23 na
## 24 na
## 25 na
## 26 na
## 27 na
## 28 na
## 29 na
## 30 na
## 31 na
## 32 na
## 33 na
## 34 na
## 35 na
## 36 na
## 37 na
## 38 na
## 39 na
## 40 na
## Total.Number.Of.SMS.Messages..Million. Mobile.Data.Usage..Petabyte.
## 1 5656 1126.7
## 2 5779.8 1028.5
## 3 6154.3 930
## 4 6098.1 773.8
## 5 6121.6 584
## 6 7033.4 380
## 7 7179.1 226.1
## 8 8487.3 175.5
## 9 9780.3 142.1
## 10 11447.9 123.8
## 11 13508 100.8
## 12 17893.4 85.5
## 13 24052.9 64.6
## 14 28947 43.9
## 15 27714.3 34.4
## 16 23287.2 18.7
## 17 16496.7 4.6
## 18 11902.5 na
## 19 10483.5 na
## 20 8647.5 na
## 21 3956.1 na
## 22 na na
## 23 na na
## 24 na na
## 25 na na
## 26 na na
## 27 na na
## 28 na na
## 29 na na
## 30 na na
## 31 na na
## 32 na na
## 33 na na
## 34 na na
## 35 na na
## 36 na na
## 37 na na
## 38 na na
## 39 na na
## 40 na na
#dplyr & stringr packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
#select dataset from 2005 to 2024
df <- df %>%
filter(Data.Series >= 2005 & Data.Series <= 2024)
print(df)
## Data.Series Total.Mobile.Phone.Subscriptions..At.End.Of.Period..Thousand.
## 1 2024 9963.5
## 2 2023 9589.6
## 3 2022 9152.4
## 4 2021 8348.5
## 5 2020 8231.2
## 6 2019 8985.1
## 7 2018 8354
## 8 2017 8452.8
## 9 2016 8398.8
## 10 2015 8211.4
## 11 2014 8093.3
## 12 2013 8420.7
## 13 2012 8063
## 14 2011 7755.2
## 15 2010 7288.6
## 16 2009 6857.2
## 17 2008 6340.2
## 18 2007 5619.2
## 19 2006 4637.7
## 20 2005 4256.8
## Total.Post.Paid.Subscriptions..2G...At.End.Of.Period..Thousand.
## 1 na
## 2 na
## 3 na
## 4 na
## 5 na
## 6 na
## 7 na
## 8 na
## 9 30.9
## 10 49.2
## 11 89.8
## 12 117.8
## 13 152.7
## 14 204.5
## 15 289.8
## 16 456.2
## 17 791.6
## 18 1318.1
## 19 1960.1
## 20 2565.1
## Total.Pre.Paid.Subscriptions..2G...At.End.Of.Period..Thousand.
## 1 na
## 2 na
## 3 na
## 4 na
## 5 na
## 6 na
## 7 na
## 8 na
## 9 101.4
## 10 172.4
## 11 213.1
## 12 993.7
## 13 1429.7
## 14 1795.3
## 15 2294.4
## 16 3240.7
## 17 3075
## 18 2600
## 19 1793.7
## 20 1516.9
## Total.Subscriptions..3G...At.End.Of.Period..Thousand.
## 1 na
## 2 64.9
## 3 542.2
## 4 690.1
## 5 769.6
## 6 1651.1
## 7 1787.5
## 8 2349
## 9 3412.1
## 10 3913.5
## 11 4608.8
## 12 5258.3
## 13 6480.6
## 14 5755.4
## 15 4704.5
## 16 3160.3
## 17 2473.6
## 18 1701
## 19 884
## 20 174.8
## Total.Post.Paid.Subscriptions..3G...At.End.Of.Period..Thousand.
## 1 na
## 2 24
## 3 41.2
## 4 80.3
## 5 96.5
## 6 467
## 7 540.5
## 8 650
## 9 855.7
## 10 1163.4
## 11 1674.4
## 12 2393.9
## 13 4113.2
## 14 3825.3
## 15 3472.7
## 16 2989.4
## 17 na
## 18 na
## 19 na
## 20 na
## Total.Pre.Paid.Subscriptions..3G...At.End.Of.Period..Thousand.
## 1 na
## 2 40.9
## 3 501
## 4 609.8
## 5 673.1
## 6 1184.1
## 7 1247
## 8 1699
## 9 2556.4
## 10 2750.1
## 11 2934.4
## 12 2864.4
## 13 2367.4
## 14 1930.1
## 15 1231.8
## 16 170.9
## 17 na
## 18 na
## 19 na
## 20 na
## Total.Subscriptions..4G...At.End.Of.Period..Thousand.
## 1 7769.6
## 2 7650.6
## 3 7367.6
## 4 7658.4
## 5 7461.6
## 6 7334.1
## 7 6566.5
## 8 6103.9
## 9 4854.4
## 10 4076.3
## 11 3181.6
## 12 2050.8
## 13 na
## 14 na
## 15 na
## 16 na
## 17 na
## 18 na
## 19 na
## 20 na
## Total.Post.Paid.Subscriptions..4G...At.End.Of.Period..Thousand.
## 1 5545.5
## 2 5565.4
## 3 5627.7
## 4 6227.9
## 5 5914.2
## 6 5580.1
## 7 4812.2
## 8 4439.5
## 9 4084.3
## 10 3628.6
## 11 2924
## 12 na
## 13 na
## 14 na
## 15 na
## 16 na
## 17 na
## 18 na
## 19 na
## 20 na
## Total.Pre.Paid.Subscriptions..4G...At.End.Of.Period..Thousand.
## 1 2224.1
## 2 2085.2
## 3 1739.9
## 4 1430.5
## 5 1547.4
## 6 1754
## 7 1754.3
## 8 1664.4
## 9 770.1
## 10 447.7
## 11 257.6
## 12 na
## 13 na
## 14 na
## 15 na
## 16 na
## 17 na
## 18 na
## 19 na
## 20 na
## Total.Subscriptions..5G...At.End.Of.Period..Thousand.
## 1 2194
## 2 1874.1
## 3 1242.6
## 4 na
## 5 na
## 6 na
## 7 na
## 8 na
## 9 na
## 10 na
## 11 na
## 12 na
## 13 na
## 14 na
## 15 na
## 16 na
## 17 na
## 18 na
## 19 na
## 20 na
## Total.Post.Paid.Subscriptions..5G...At.End.Of.Period..Thousand.
## 1 1871.4
## 2 1590.8
## 3 1149.5
## 4 na
## 5 na
## 6 na
## 7 na
## 8 na
## 9 na
## 10 na
## 11 na
## 12 na
## 13 na
## 14 na
## 15 na
## 16 na
## 17 na
## 18 na
## 19 na
## 20 na
## Total.Pre.Paid.Subscriptions..5G...At.End.Of.Period..Thousand.
## 1 322.6
## 2 283.3
## 3 93.1
## 4 na
## 5 na
## 6 na
## 7 na
## 8 na
## 9 na
## 10 na
## 11 na
## 12 na
## 13 na
## 14 na
## 15 na
## 16 na
## 17 na
## 18 na
## 19 na
## 20 na
## Total.Fixed.Line.Subscriptions..At.End.Of.Period..Thousand.
## 1 2041.0
## 2 1912.4
## 3 1906.2
## 4 1889.8
## 5 1891.6
## 6 1905.8
## 7 1966.6
## 8 1983.1
## 9 2003.2
## 10 2017.3
## 11 1996.6
## 12 1970.8
## 13 1989.5
## 14 2016.9
## 15 1983.9
## 16 1896.1
## 17 1872.0
## 18 1859.3
## 19 1851.0
## 20 1847.8
## X..Residential.Fixed.Line.Subscriptions..At.End.Of.Period..Thousand.
## 1 1163.7
## 2 1193.3
## 3 1210.3
## 4 1224.9
## 5 1252.7
## 6 1280.9
## 7 1312.3
## 8 1299.2
## 9 1295.9
## 10 1280.6
## 11 1247.6
## 12 1207.7
## 13 1214.1
## 14 1237.1
## 15 1203.1
## 16 1128.3
## 17 1095.7
## 18 1087.1
## 19 1092.4
## 20 1088.4
## X..Corporate.Business.Fixed.Line.Subscriptions..At.End.Of.Period..Thousand.
## 1 877.3
## 2 719.1
## 3 695.9
## 4 664.9
## 5 638.9
## 6 624.9
## 7 654.3
## 8 683.9
## 9 707.3
## 10 736.8
## 11 749
## 12 763.1
## 13 775.4
## 14 779.7
## 15 780.8
## 16 767.8
## 17 776.4
## 18 772.2
## 19 758.6
## 20 759.4
## International.Telephone.Call.Minutes..Million.
## 1 4355.5
## 2 5679.9
## 3 8019.4
## 4 10294.2
## 5 13544.8
## 6 18544.7
## 7 22920.0
## 8 21242.0
## 9 23638.8
## 10 22018.2
## 11 18165.5
## 12 14489.6
## 13 13444.8
## 14 12367.4
## 15 12436.3
## 16 10169.0
## 17 8456.7
## 18 6302.5
## 19 5074.1
## 20 4777.9
## Total.Broadband.Internet.Subscriptions..At.End.Of.Period..Thousand.
## 1 12735.9
## 2 12689.6
## 3 12565
## 4 11902.3
## 5 11170.2
## 6 12217.1
## 7 11820
## 8 13062.8
## 9 12543
## 10 11992.1
## 11 11537.2
## 12 10653.6
## 13 10194.9
## 14 9222.3
## 15 7849.3
## 16 5819
## 17 4717.5
## 18 3250.7
## 19 757.5
## 20 629.6
## X..Total.Wireless.Broadband.Subscriptions..At.End.Of.Period..Thousand.
## 1 11113.7
## 2 11108
## 3 11018
## 4 10382.4
## 5 9652.6
## 6 10719.6
## 7 10330.1
## 8 11592.5
## 9 11088
## 10 10518.8
## 11 10083.3
## 12 9260.1
## 13 8823.9
## 14 7898.9
## 15 6587.2
## 16 4717.4
## 17 na
## 18 na
## 19 na
## 20 na
## X..Residential.Wired.Broadband.Subscriptions..At.End.Of.Period..Thousand.
## 1 1484.9
## 2 1443.7
## 3 1415.3
## 4 1390.2
## 5 1392.2
## 6 1372.8
## 7 1366.4
## 8 1343.2
## 9 1336.2
## 10 1359.9
## 11 1347.5
## 12 1295.4
## 13 1277.6
## 14 1237.9
## 15 na
## 16 na
## 17 na
## 18 na
## 19 na
## 20 na
## X..Corporate.Wired.Broadband.Subscriptions..At.End.Of.Period..Thousand.
## 1 137.3
## 2 137.9
## 3 131.7
## 4 129.7
## 5 125.5
## 6 124.7
## 7 123.6
## 8 127.1
## 9 118.8
## 10 113.4
## 11 106.5
## 12 98
## 13 93.4
## 14 85.5
## 15 na
## 16 na
## 17 na
## 18 na
## 19 na
## 20 na
## Total.Number.Of.SMS.Messages..Million. Mobile.Data.Usage..Petabyte.
## 1 5656 1126.7
## 2 5779.8 1028.5
## 3 6154.3 930
## 4 6098.1 773.8
## 5 6121.6 584
## 6 7033.4 380
## 7 7179.1 226.1
## 8 8487.3 175.5
## 9 9780.3 142.1
## 10 11447.9 123.8
## 11 13508 100.8
## 12 17893.4 85.5
## 13 24052.9 64.6
## 14 28947 43.9
## 15 27714.3 34.4
## 16 23287.2 18.7
## 17 16496.7 4.6
## 18 11902.5 na
## 19 10483.5 na
## 20 8647.5 na
glimpse(df)
## Rows: 20
## Columns: 23
## $ Data.Series <int> …
## $ Total.Mobile.Phone.Subscriptions..At.End.Of.Period..Thousand. <chr> …
## $ Total.Post.Paid.Subscriptions..2G...At.End.Of.Period..Thousand. <chr> …
## $ Total.Pre.Paid.Subscriptions..2G...At.End.Of.Period..Thousand. <chr> …
## $ Total.Subscriptions..3G...At.End.Of.Period..Thousand. <chr> …
## $ Total.Post.Paid.Subscriptions..3G...At.End.Of.Period..Thousand. <chr> …
## $ Total.Pre.Paid.Subscriptions..3G...At.End.Of.Period..Thousand. <chr> …
## $ Total.Subscriptions..4G...At.End.Of.Period..Thousand. <chr> …
## $ Total.Post.Paid.Subscriptions..4G...At.End.Of.Period..Thousand. <chr> …
## $ Total.Pre.Paid.Subscriptions..4G...At.End.Of.Period..Thousand. <chr> …
## $ Total.Subscriptions..5G...At.End.Of.Period..Thousand. <chr> …
## $ Total.Post.Paid.Subscriptions..5G...At.End.Of.Period..Thousand. <chr> …
## $ Total.Pre.Paid.Subscriptions..5G...At.End.Of.Period..Thousand. <chr> …
## $ Total.Fixed.Line.Subscriptions..At.End.Of.Period..Thousand. <dbl> …
## $ X..Residential.Fixed.Line.Subscriptions..At.End.Of.Period..Thousand. <chr> …
## $ X..Corporate.Business.Fixed.Line.Subscriptions..At.End.Of.Period..Thousand. <chr> …
## $ International.Telephone.Call.Minutes..Million. <dbl> …
## $ Total.Broadband.Internet.Subscriptions..At.End.Of.Period..Thousand. <chr> …
## $ X..Total.Wireless.Broadband.Subscriptions..At.End.Of.Period..Thousand. <chr> …
## $ X..Residential.Wired.Broadband.Subscriptions..At.End.Of.Period..Thousand. <chr> …
## $ X..Corporate.Wired.Broadband.Subscriptions..At.End.Of.Period..Thousand. <chr> …
## $ Total.Number.Of.SMS.Messages..Million. <chr> …
## $ Mobile.Data.Usage..Petabyte. <chr> …
#select important columns that need to be proceed analysis
important_columns <- c(
"Data.Series",
"Total.Mobile.Phone.Subscriptions..At.End.Of.Period..Thousand.",
"Total.Fixed.Line.Subscriptions..At.End.Of.Period..Thousand.",
"Total.Broadband.Internet.Subscriptions..At.End.Of.Period..Thousand.",
"International.Telephone.Call.Minutes..Million.",
"Total.Number.Of.SMS.Messages..Million.",
"Mobile.Data.Usage..Petabyte."
)
df_summary <- df %>%
select(all_of(important_columns))
print(df_summary)
## Data.Series Total.Mobile.Phone.Subscriptions..At.End.Of.Period..Thousand.
## 1 2024 9963.5
## 2 2023 9589.6
## 3 2022 9152.4
## 4 2021 8348.5
## 5 2020 8231.2
## 6 2019 8985.1
## 7 2018 8354
## 8 2017 8452.8
## 9 2016 8398.8
## 10 2015 8211.4
## 11 2014 8093.3
## 12 2013 8420.7
## 13 2012 8063
## 14 2011 7755.2
## 15 2010 7288.6
## 16 2009 6857.2
## 17 2008 6340.2
## 18 2007 5619.2
## 19 2006 4637.7
## 20 2005 4256.8
## Total.Fixed.Line.Subscriptions..At.End.Of.Period..Thousand.
## 1 2041.0
## 2 1912.4
## 3 1906.2
## 4 1889.8
## 5 1891.6
## 6 1905.8
## 7 1966.6
## 8 1983.1
## 9 2003.2
## 10 2017.3
## 11 1996.6
## 12 1970.8
## 13 1989.5
## 14 2016.9
## 15 1983.9
## 16 1896.1
## 17 1872.0
## 18 1859.3
## 19 1851.0
## 20 1847.8
## Total.Broadband.Internet.Subscriptions..At.End.Of.Period..Thousand.
## 1 12735.9
## 2 12689.6
## 3 12565
## 4 11902.3
## 5 11170.2
## 6 12217.1
## 7 11820
## 8 13062.8
## 9 12543
## 10 11992.1
## 11 11537.2
## 12 10653.6
## 13 10194.9
## 14 9222.3
## 15 7849.3
## 16 5819
## 17 4717.5
## 18 3250.7
## 19 757.5
## 20 629.6
## International.Telephone.Call.Minutes..Million.
## 1 4355.5
## 2 5679.9
## 3 8019.4
## 4 10294.2
## 5 13544.8
## 6 18544.7
## 7 22920.0
## 8 21242.0
## 9 23638.8
## 10 22018.2
## 11 18165.5
## 12 14489.6
## 13 13444.8
## 14 12367.4
## 15 12436.3
## 16 10169.0
## 17 8456.7
## 18 6302.5
## 19 5074.1
## 20 4777.9
## Total.Number.Of.SMS.Messages..Million. Mobile.Data.Usage..Petabyte.
## 1 5656 1126.7
## 2 5779.8 1028.5
## 3 6154.3 930
## 4 6098.1 773.8
## 5 6121.6 584
## 6 7033.4 380
## 7 7179.1 226.1
## 8 8487.3 175.5
## 9 9780.3 142.1
## 10 11447.9 123.8
## 11 13508 100.8
## 12 17893.4 85.5
## 13 24052.9 64.6
## 14 28947 43.9
## 15 27714.3 34.4
## 16 23287.2 18.7
## 17 16496.7 4.6
## 18 11902.5 na
## 19 10483.5 na
## 20 8647.5 na
df_summary <- df_summary %>%
rename_with(~ case_when(
.x == "Data.Series" ~ "data_series",
.x == "Total.Mobile.Phone.Subscriptions..At.End.Of.Period..Thousand." ~ "total_mobile_phone_subscriptions",
.x == "Total.Fixed.Line.Subscriptions..At.End.Of.Period..Thousand." ~ "total_fixed_line_subscriptions",
.x == "Total.Broadband.Internet.Subscriptions..At.End.Of.Period..Thousand." ~ "total_broadband_internet_subscriptions",
.x == "International.Telephone.Call.Minutes..Million." ~ "international_telephone_call_minutes",
.x == "Total.Number.Of.SMS.Messages..Million." ~ "total_number_of_sms_messages",
.x == "Mobile.Data.Usage..Petabyte." ~ "mobile_data_usage",
TRUE ~ .x
))
print(df_summary)
## data_series total_mobile_phone_subscriptions total_fixed_line_subscriptions
## 1 2024 9963.5 2041.0
## 2 2023 9589.6 1912.4
## 3 2022 9152.4 1906.2
## 4 2021 8348.5 1889.8
## 5 2020 8231.2 1891.6
## 6 2019 8985.1 1905.8
## 7 2018 8354 1966.6
## 8 2017 8452.8 1983.1
## 9 2016 8398.8 2003.2
## 10 2015 8211.4 2017.3
## 11 2014 8093.3 1996.6
## 12 2013 8420.7 1970.8
## 13 2012 8063 1989.5
## 14 2011 7755.2 2016.9
## 15 2010 7288.6 1983.9
## 16 2009 6857.2 1896.1
## 17 2008 6340.2 1872.0
## 18 2007 5619.2 1859.3
## 19 2006 4637.7 1851.0
## 20 2005 4256.8 1847.8
## total_broadband_internet_subscriptions international_telephone_call_minutes
## 1 12735.9 4355.5
## 2 12689.6 5679.9
## 3 12565 8019.4
## 4 11902.3 10294.2
## 5 11170.2 13544.8
## 6 12217.1 18544.7
## 7 11820 22920.0
## 8 13062.8 21242.0
## 9 12543 23638.8
## 10 11992.1 22018.2
## 11 11537.2 18165.5
## 12 10653.6 14489.6
## 13 10194.9 13444.8
## 14 9222.3 12367.4
## 15 7849.3 12436.3
## 16 5819 10169.0
## 17 4717.5 8456.7
## 18 3250.7 6302.5
## 19 757.5 5074.1
## 20 629.6 4777.9
## total_number_of_sms_messages mobile_data_usage
## 1 5656 1126.7
## 2 5779.8 1028.5
## 3 6154.3 930
## 4 6098.1 773.8
## 5 6121.6 584
## 6 7033.4 380
## 7 7179.1 226.1
## 8 8487.3 175.5
## 9 9780.3 142.1
## 10 11447.9 123.8
## 11 13508 100.8
## 12 17893.4 85.5
## 13 24052.9 64.6
## 14 28947 43.9
## 15 27714.3 34.4
## 16 23287.2 18.7
## 17 16496.7 4.6
## 18 11902.5 na
## 19 10483.5 na
## 20 8647.5 na
#fill in na values to 0
df_cleaned <- df_summary %>%
mutate(
across(c(total_mobile_phone_subscriptions,total_broadband_internet_subscriptions,total_number_of_sms_messages, mobile_data_usage), ~{
.x %>%
na_if("na") %>% # Convert "na" string to NA
as.numeric() %>% # Convert to numeric
replace_na(0) # Replace NA with 0
})
)
print(df_cleaned)
## data_series total_mobile_phone_subscriptions total_fixed_line_subscriptions
## 1 2024 9963.5 2041.0
## 2 2023 9589.6 1912.4
## 3 2022 9152.4 1906.2
## 4 2021 8348.5 1889.8
## 5 2020 8231.2 1891.6
## 6 2019 8985.1 1905.8
## 7 2018 8354.0 1966.6
## 8 2017 8452.8 1983.1
## 9 2016 8398.8 2003.2
## 10 2015 8211.4 2017.3
## 11 2014 8093.3 1996.6
## 12 2013 8420.7 1970.8
## 13 2012 8063.0 1989.5
## 14 2011 7755.2 2016.9
## 15 2010 7288.6 1983.9
## 16 2009 6857.2 1896.1
## 17 2008 6340.2 1872.0
## 18 2007 5619.2 1859.3
## 19 2006 4637.7 1851.0
## 20 2005 4256.8 1847.8
## total_broadband_internet_subscriptions international_telephone_call_minutes
## 1 12735.9 4355.5
## 2 12689.6 5679.9
## 3 12565.0 8019.4
## 4 11902.3 10294.2
## 5 11170.2 13544.8
## 6 12217.1 18544.7
## 7 11820.0 22920.0
## 8 13062.8 21242.0
## 9 12543.0 23638.8
## 10 11992.1 22018.2
## 11 11537.2 18165.5
## 12 10653.6 14489.6
## 13 10194.9 13444.8
## 14 9222.3 12367.4
## 15 7849.3 12436.3
## 16 5819.0 10169.0
## 17 4717.5 8456.7
## 18 3250.7 6302.5
## 19 757.5 5074.1
## 20 629.6 4777.9
## total_number_of_sms_messages mobile_data_usage
## 1 5656.0 1126.7
## 2 5779.8 1028.5
## 3 6154.3 930.0
## 4 6098.1 773.8
## 5 6121.6 584.0
## 6 7033.4 380.0
## 7 7179.1 226.1
## 8 8487.3 175.5
## 9 9780.3 142.1
## 10 11447.9 123.8
## 11 13508.0 100.8
## 12 17893.4 85.5
## 13 24052.9 64.6
## 14 28947.0 43.9
## 15 27714.3 34.4
## 16 23287.2 18.7
## 17 16496.7 4.6
## 18 11902.5 0.0
## 19 10483.5 0.0
## 20 8647.5 0.0
# For data manipulation, data visualization, reshaping data (useful for plotting)
library(dplyr)
library(ggplot2)
library(tidyr)
#Break down every metric for EDA
df_long_trends <- df_cleaned %>%
pivot_longer(
cols = -data_series, # All columns except 'data_series'
names_to = "Metric",
values_to = "Value"
)
# Grouped line plot for all metrics
ggplot(df_long_trends, aes(x = data_series, y = Value, color = Metric)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
facet_wrap(~ Metric, scales = "free_y", ncol = 2) +
labs(
title = "Trends of Various Telecommunication Metrics Over Time",
x = "Year",
y = "Value"
) +
scale_x_continuous(breaks = unique(df_cleaned$data_series)) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom")
# correlation matrix to identify the correlation relationship
print(df_cleaned)
## data_series total_mobile_phone_subscriptions total_fixed_line_subscriptions
## 1 2024 9963.5 2041.0
## 2 2023 9589.6 1912.4
## 3 2022 9152.4 1906.2
## 4 2021 8348.5 1889.8
## 5 2020 8231.2 1891.6
## 6 2019 8985.1 1905.8
## 7 2018 8354.0 1966.6
## 8 2017 8452.8 1983.1
## 9 2016 8398.8 2003.2
## 10 2015 8211.4 2017.3
## 11 2014 8093.3 1996.6
## 12 2013 8420.7 1970.8
## 13 2012 8063.0 1989.5
## 14 2011 7755.2 2016.9
## 15 2010 7288.6 1983.9
## 16 2009 6857.2 1896.1
## 17 2008 6340.2 1872.0
## 18 2007 5619.2 1859.3
## 19 2006 4637.7 1851.0
## 20 2005 4256.8 1847.8
## total_broadband_internet_subscriptions international_telephone_call_minutes
## 1 12735.9 4355.5
## 2 12689.6 5679.9
## 3 12565.0 8019.4
## 4 11902.3 10294.2
## 5 11170.2 13544.8
## 6 12217.1 18544.7
## 7 11820.0 22920.0
## 8 13062.8 21242.0
## 9 12543.0 23638.8
## 10 11992.1 22018.2
## 11 11537.2 18165.5
## 12 10653.6 14489.6
## 13 10194.9 13444.8
## 14 9222.3 12367.4
## 15 7849.3 12436.3
## 16 5819.0 10169.0
## 17 4717.5 8456.7
## 18 3250.7 6302.5
## 19 757.5 5074.1
## 20 629.6 4777.9
## total_number_of_sms_messages mobile_data_usage
## 1 5656.0 1126.7
## 2 5779.8 1028.5
## 3 6154.3 930.0
## 4 6098.1 773.8
## 5 6121.6 584.0
## 6 7033.4 380.0
## 7 7179.1 226.1
## 8 8487.3 175.5
## 9 9780.3 142.1
## 10 11447.9 123.8
## 11 13508.0 100.8
## 12 17893.4 85.5
## 13 24052.9 64.6
## 14 28947.0 43.9
## 15 27714.3 34.4
## 16 23287.2 18.7
## 17 16496.7 4.6
## 18 11902.5 0.0
## 19 10483.5 0.0
## 20 8647.5 0.0
library(corrplot)
## corrplot 0.95 loaded
correlation_matrix <- cor(df_cleaned)
print("\n--- Correlation Matrix (Rounded to 2 Decimal Places) ---")
## [1] "\n--- Correlation Matrix (Rounded to 2 Decimal Places) ---"
print(round(correlation_matrix, 2))
## data_series
## data_series 1.00
## total_mobile_phone_subscriptions 0.90
## total_fixed_line_subscriptions 0.33
## total_broadband_internet_subscriptions 0.87
## international_telephone_call_minutes 0.20
## total_number_of_sms_messages -0.58
## mobile_data_usage 0.87
## total_mobile_phone_subscriptions
## data_series 0.90
## total_mobile_phone_subscriptions 1.00
## total_fixed_line_subscriptions 0.60
## total_broadband_internet_subscriptions 0.96
## international_telephone_call_minutes 0.37
## total_number_of_sms_messages -0.24
## mobile_data_usage 0.68
## total_fixed_line_subscriptions
## data_series 0.33
## total_mobile_phone_subscriptions 0.60
## total_fixed_line_subscriptions 1.00
## total_broadband_internet_subscriptions 0.64
## international_telephone_call_minutes 0.54
## total_number_of_sms_messages 0.28
## mobile_data_usage 0.05
## total_broadband_internet_subscriptions
## data_series 0.87
## total_mobile_phone_subscriptions 0.96
## total_fixed_line_subscriptions 0.64
## total_broadband_internet_subscriptions 1.00
## international_telephone_call_minutes 0.56
## total_number_of_sms_messages -0.26
## mobile_data_usage 0.56
## international_telephone_call_minutes
## data_series 0.20
## total_mobile_phone_subscriptions 0.37
## total_fixed_line_subscriptions 0.54
## total_broadband_internet_subscriptions 0.56
## international_telephone_call_minutes 1.00
## total_number_of_sms_messages 0.00
## mobile_data_usage -0.29
## total_number_of_sms_messages
## data_series -0.58
## total_mobile_phone_subscriptions -0.24
## total_fixed_line_subscriptions 0.28
## total_broadband_internet_subscriptions -0.26
## international_telephone_call_minutes 0.00
## total_number_of_sms_messages 1.00
## mobile_data_usage -0.60
## mobile_data_usage
## data_series 0.87
## total_mobile_phone_subscriptions 0.68
## total_fixed_line_subscriptions 0.05
## total_broadband_internet_subscriptions 0.56
## international_telephone_call_minutes -0.29
## total_number_of_sms_messages -0.60
## mobile_data_usage 1.00
corrplot(correlation_matrix,
method = "square",
type = "full",
order = "hclust", # Reorders variables using hierarchical clustering to group similar correlations
tl.col = "black", # Color of text labels (column/row names)
tl.srt = 45, # Angle of text labels
col = colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))(200), # Custom color palette for strong negatives (red) to strong positives (blue)
addCoef.col = "black",
cl.pos = "b",
mar = c(0,0,1,0),
title = "Correlation Matrix of All Telecommunication Metrics", # Plot title
tl.cex = 0.5,
cl.cex = 0.5,
)
Model development
# Define the split point
train_end_year <- 2019
# Split the data frame
train_df <- df_cleaned %>% filter(data_series <= train_end_year)
test_df <- df_cleaned %>% filter(data_series > train_end_year)
# Model Training
lm_simple <- lm(total_mobile_phone_subscriptions ~ data_series, data = df_cleaned)
print(summary(lm_simple))
##
## Call:
## lm(formula = total_mobile_phone_subscriptions ~ data_series,
## data = df_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1292.1 -351.1 109.1 400.4 1017.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -459205.60 52790.80 -8.699 7.29e-08 ***
## data_series 231.80 26.21 8.845 5.70e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 675.8 on 18 degrees of freedom
## Multiple R-squared: 0.813, Adjusted R-squared: 0.8026
## F-statistic: 78.24 on 1 and 18 DF, p-value: 5.696e-08
# Model Forecasting
predictions_simple <- predict(lm_simple, newdata = test_df)
print(predictions_simple)
## 1 2 3 4 5
## 9953.039 9721.241 9489.443 9257.645 9025.848
# Model Evaluation
actual_values_simple <- test_df$total_mobile_phone_subscriptions
mae_simple <- mean(abs(actual_values_simple - predictions_simple))
rmse_simple <- sqrt(mean((actual_values_simple - predictions_simple)^2))
mape_simple <- mean(abs((actual_values_simple - predictions_simple) / actual_values_simple)) * 100
print(paste0("Simple LR - MAE: ", round(mae_simple, 2)))
## [1] "Simple LR - MAE: 436.59"
print(paste0("Simple LR - RMSE: ", round(rmse_simple, 2)))
## [1] "Simple LR - RMSE: 563.75"
print(paste0("Simple LR - MAPE: ", round(mape_simple, 2), "%"))
## [1] "Simple LR - MAPE: 5.14%"
# Combine actual and predicted values for plotting
plot_data_simple <- df_cleaned %>%
select(data_series, total_mobile_phone_subscriptions) %>%
mutate(
Predicted = ifelse(data_series > train_end_year, predictions_simple, NA),
Type = "Actual"
)
# Add fitted values for the training period
fitted_values_simple <- predict(lm_simple, newdata = train_df)
plot_data_simple$Predicted[plot_data_simple$data_series <= train_end_year] <- fitted_values_simple
ggplot(plot_data_simple, aes(x = data_series)) +
geom_line(aes(y = total_mobile_phone_subscriptions, color = "Actual"), linewidth = 1) +
geom_point(aes(y = total_mobile_phone_subscriptions, color = "Actual"), size = 2) +
geom_line(aes(y = Predicted, color = "Predicted"), linewidth = 1, linetype = "dashed") +
geom_point(aes(y = Predicted, color = "Predicted"), size = 2) +
labs(
title = "Simple Linear Regression: Mobile Subscriptions Forecast",
subtitle = paste0("Train End Year: ", train_end_year, " | MAE: ", round(mae_simple, 2), " | RMSE: ", round(rmse_simple, 2)),
x = "Year",
y = "Mobile Subscriptions (Thousands)",
color = "Type"
) +
scale_x_continuous(breaks = unique(df_cleaned$data_series)) +
scale_color_manual(values = c("Actual" = "blue", "Predicted" = "red")) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
legend.position = "bottom")
Model development
# Model Training
# Select predictors based on insights
lm_multiple <- lm(
total_mobile_phone_subscriptions ~ data_series +
total_broadband_internet_subscriptions +
mobile_data_usage +
total_fixed_line_subscriptions +
international_telephone_call_minutes +
total_number_of_sms_messages,
data = train_df
)
print(summary(lm_multiple))
##
## Call:
## lm(formula = total_mobile_phone_subscriptions ~ data_series +
## total_broadband_internet_subscriptions + mobile_data_usage +
## total_fixed_line_subscriptions + international_telephone_call_minutes +
## total_number_of_sms_messages, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -271.15 -57.36 23.89 46.54 232.21
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.468e+05 2.573e+05 -0.571 0.58397
## data_series 8.133e+01 1.277e+02 0.637 0.54183
## total_broadband_internet_subscriptions 3.759e-01 7.801e-02 4.818 0.00132
## mobile_data_usage -5.600e-01 2.140e+00 -0.262 0.80019
## total_fixed_line_subscriptions -6.741e+00 2.342e+00 -2.879 0.02056
## international_telephone_call_minutes -1.723e-02 3.913e-02 -0.440 0.67130
## total_number_of_sms_messages 4.739e-02 1.124e-02 4.218 0.00292
##
## (Intercept)
## data_series
## total_broadband_internet_subscriptions **
## mobile_data_usage
## total_fixed_line_subscriptions *
## international_telephone_call_minutes
## total_number_of_sms_messages **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 151.5 on 8 degrees of freedom
## Multiple R-squared: 0.9939, Adjusted R-squared: 0.9894
## F-statistic: 218.5 on 6 and 8 DF, p-value: 2.009e-08
# Model Forecasting (Prediction)
predictions_multiple <- predict(lm_multiple, newdata = test_df)
print(predictions_multiple)
## 1 2 3 4 5
## 8404.317 9210.469 9156.680 8982.398 8665.124
# Model Evaluation
actual_values_multiple <- test_df$total_mobile_phone_subscriptions
mae_multiple <- mean(abs(actual_values_multiple - predictions_multiple))
rmse_multiple <- sqrt(mean((actual_values_multiple - predictions_multiple)^2))
mape_multiple <- mean(abs((actual_values_multiple - predictions_multiple) / actual_values_multiple)) * 100
print(paste0("Multiple LR - MAE: ", round(mae_multiple, 2)))
## [1] "Multiple LR - MAE: 602.08"
print(paste0("Multiple LR - RMSE: ", round(rmse_multiple, 2)))
## [1] "Multiple LR - RMSE: 795.6"
print(paste0("Multiple LR - MAPE: ", round(mape_multiple, 2), "%"))
## [1] "Multiple LR - MAPE: 6.5%"
# Visualization
plot_data_multiple <- df_cleaned %>%
select(data_series, total_mobile_phone_subscriptions) %>%
mutate(
Predicted = ifelse(data_series > train_end_year, predictions_multiple, NA),
Type = "Actual"
)
fitted_values_multiple <- predict(lm_multiple, newdata = train_df)
plot_data_multiple$Predicted[plot_data_multiple$data_series <= train_end_year] <- fitted_values_multiple
library(ggplot2)
ggplot(plot_data_multiple, aes(x = data_series)) +
geom_line(aes(y = total_mobile_phone_subscriptions, color = "Actual"), linewidth = 1) +
geom_point(aes(y = total_mobile_phone_subscriptions, color = "Actual"), size = 2) +
geom_line(aes(y = Predicted, color = "Predicted"), linewidth = 1, linetype = "dashed") +
geom_point(aes(y = Predicted, color = "Predicted"), size = 2) +
labs(
title = "Multiple Linear Regression: Mobile Subscriptions Forecast",
subtitle = paste0("Train End Year: ", train_end_year, " | MAE: ", round(mae_multiple, 2), " | RMSE: ", round(rmse_multiple, 2)),
x = "Year",
y = "Total Mobile Subscriptions (Thousands)",
color = "Type"
) +
scale_x_continuous(breaks = unique(df_cleaned$data_series)) +
scale_color_manual(values = c("Actual" = "blue", "Predicted" = "red")) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
legend.position = "bottom")
Model Development
library(dplyr)
library(ggplot2)
library(tidyr)
features_base <- c("data_series",
"total_broadband_internet_subscriptions",
"mobile_data_usage",
"total_fixed_line_subscriptions",
"international_telephone_call_minutes",
"total_number_of_sms_messages")
# Define the split point
train_end_year <- 2019
# Data splitting
train_df_rf <- df_cleaned %>% filter(data_series <= train_end_year)
test_df_rf <- df_cleaned %>% filter(data_series > train_end_year)
# Define the full set of features for Random Forest
features_rf <- features_base
target_metric <- c("total_mobile_phone_subscriptions")
# Create the formula for Random Forest
rf <- as.formula(paste(target_metric, "~", paste(features_rf, collapse = " + ")))
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
rf_model <- randomForest(
formula = rf,data = train_df_rf,ntree = 500,mtry = max(floor(length(features_rf) / 3), 1),importance = TRUE,na.action = na.omit)
print(rf_model)
##
## Call:
## randomForest(formula = rf, data = train_df_rf, ntree = 500, mtry = max(floor(length(features_rf)/3), 1), importance = TRUE, na.action = na.omit)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 306669
## % Var explained: 84.81
Model Evaluation
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# --- Evaluation for Random Forest Model ---
predictions_rf <- predict(rf_model, newdata = test_df_rf)
accuracy_rf <- accuracy(f = predictions_rf, x = test_df_rf[[target_metric]])
## Warning in accuracy.default(f = predictions_rf, x =
## test_df_rf[[target_metric]]): Using `f` as the argument for `accuracy()` is
## deprecated. Please use `object` instead.
print(accuracy_rf)
## ME RMSE MAE MPE MAPE
## Test set 1004.717 1322.003 1090.952 10.44344 11.4911
# --- Visualize Random Forest Predictions ---
plot_df_rf <- df_cleaned %>%
select(data_series, !!sym(target_metric)) %>%
rename(Actual = !!sym(target_metric)) %>%
mutate(Predicted = NA_real_)
plot_df_rf$Predicted[plot_df_rf$data_series %in% test_df_rf$data_series] <- predictions_rf
fitted_rf_train <- predict(rf_model, newdata = train_df_rf)
plot_df_rf$Predicted[plot_df_rf$data_series %in% train_df_rf$data_series] <- fitted_rf_train
ggplot(plot_df_rf, aes(x = data_series)) +
geom_line(aes(y = Actual, color = "Actual"), linewidth = 1) +
geom_point(aes(y = Actual, color = "Actual"), size = 2) +
geom_line(aes(y = Predicted, color = "Predicted"), linewidth = 1, linetype = "dashed") +
geom_point(aes(y = Predicted, color = "Predicted"), size = 2) +
labs(
title = paste0("Random Forest Forecast vs MLR (Train to ", train_end_year, ")"),
subtitle = paste0("MAE: ", round(accuracy_rf[1, "MAE"], 2),
" | RMSE: ", round(accuracy_rf[1, "RMSE"], 2),
" | MAPE: ", round(accuracy_rf[1, "MAPE"], 2), "%"),
x = "Year",
y = paste0(target_metric, " (Value)"),
color = "Type"
) +
scale_x_continuous(breaks = unique(df_cleaned$data_series)) +
scale_color_manual(values = c("Actual" = "blue", "Predicted" = "red")) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
legend.position = "bottom",
axis.text.x = element_text(angle = 45, hjust = 1))
Definition of the metrics discussed are listed below:
MAE (Mean Absolute Error) is the average of the absolute differences between predicted and actual values. RMSE (Root Mean Squared Error) is defined as the square root of the average of the squared differences between predicted and actual values. MAPE (Mean Absolute Percentage Error) is defined as the average of the absolute percentage errors.
Comparison of Model Performance as per generated output
| Metric | Random Forest | Multiple Linear Regression (MLR) |
|---|---|---|
| MAE | 1201.79 | 658.26 |
| RMSE | 1561.59 | 793.41 |
| MAPE | 13.05% | 6.95% |
Multiple Linear Regression (MLR) model clearly outperformed the Random Forest model on the test set. Observations are listed as below:
MLR model achieved significantly lower MAE , RMSE stated fewer large errors and MAPE showing lesser error by 6.95% of the actual value.
By forecasting on the total mobile phone subscriptions, the outputs have strongly suggests other metrics have a higher correlation with number of mobile phone subscriptions.
Multiple Linear Regression model has demonstrates a much better accuracy on unseen data with it’s core relationship might be more linear than complex whereas Random Forest model struggles to capture the underlying patterns as effectively as a simpler linear model in this scenario.