Include libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
library("xlsx")
library("xlsxjars")
## Loading required package: rJava

Import Dataset

#Load dataset
file_path = "/Users/saraawad/Desktop/base_stations.csv";
d_base <- read.table(file=file_path, sep=",", stringsAsFactors = FALSE, 
                     header=TRUE)
d_base$X <- NULL

Data Cleaning

#Convert column names to lower case
names(d_base) <- tolower(names(d_base))

str(d_base)
## 'data.frame':    103005 obs. of  10 variables:
##  $ bts_id    : int  5074914 5055395 5064215 5064217 5070617 5056817 5066712 5066401 5064176 5063407 ...
##  $ mx_lat1   : int  41 39 37 37 39 39 37 40 40 40 ...
##  $ mx_lat2   : int  29 33 56 54 54 12 57 46 45 45 ...
##  $ mx_lat3   : num  23.5 17.9 24.5 38.9 51.4 ...
##  $ mx_long1  : int  32 27 32 32 28 28 34 29 29 30 ...
##  $ mx_long2  : int  5 41 29 31 9 5 41 46 53 1 ...
##  $ mx_long3  : num  38.7 31.9 45.4 47.8 24.5 ...
##  $ mx_sahail : chr  "BARTIN" "BALIKESIR" "KONYA" "KONYA" ...
##  $ mx_silce  : chr  "CAYCUMA" "ALTIEYLUL" "SELCUKLU" "KARATAY" ...
##  $ mx_poparea: chr  "" "RURAL" "SUB_URBAN" "INDUSTRIAL" ...
#Check for missing "NA" values in each column in this dataframe
colSums(is.na(d_base))
##     bts_id    mx_lat1    mx_lat2    mx_lat3   mx_long1   mx_long2 
##          0       3880       3744       3646       3883       3743 
##   mx_long3  mx_sahail   mx_silce mx_poparea 
##       3650          0          0          0
#Count empty fields
sum(d_base$mx_sahail == "")
## [1] 1
sum(d_base$mx_silce == "")
## [1] 3728
#Columns such as mx_sahail, mx_silce have empty data

#Get unique length of all columns
apply(d_base, 2, function(x) length(unique(x)))
##     bts_id    mx_lat1    mx_lat2    mx_lat3   mx_long1   mx_long2 
##     103005         19         62       5939         31         62 
##   mx_long3  mx_sahail   mx_silce mx_poparea 
##       5934         84       1022        238
#Check if number of rows equal to number of base station ids to make sure no duplicates
nrow(d_base) == unique(length(d_base$bts_id))
## [1] TRUE
#Convert all digits to NA since we are not concerned with digits
d_base$mx_sahail <- ifelse(grepl("[[:digit:]]", d_base$mx_sahail) == TRUE, NA, d_base$mx_sahail)

#Get count for each column mx_sahail
table(d_base$mx_sahail)
## 
##                         ADANA       ADIYAMAN          AFYON AFYONKARAHISAR 
##              1           2200            488            695            107 
##           AGRI        AKSARAY         AMASYA         ANKARA        ANTALYA 
##            565            418            350          10636           4105 
##        ARDAHAN         ARTVIN          AYDIN      BALIKESIR         BARTIN 
##            199            391           1191           1484            253 
##         BATMAN        BAYBURT        BILECIK         BINGOL         BITLIS 
##            432            164            371            342            346 
##           BOLU         BURDUR          BURSA      CANAKKALE        CANKIRI 
##            565            465           3200            797            327 
##          CORUM        DENIZLI     DIYARBAKIR          DUZCE         EDIRNE 
##            617           1128           1296            384            543 
##         ELAZIG       ERZINCAN        ERZURUM      ESKISEHIR      GAZIANTEP 
##            609            572           1306           1098           1503 
##        GIRESUN      GUMUSHANE        HAKKARI          HATAY           ICEL 
##            629            311            313           1065           1729 
##          IGDIR        ISPARTA       ISTANBUL          IZMIR        KARABUK 
##            244            594          23131           6418            332 
##        KARAMAN           KARS      KASTAMONU        KAYSERI          KILIS 
##            352            460            520           1503            177 
##      KIRIKKALE     KIRKLARELI       KIRSEHIR         KMARAS        KOCAELI 
##            353            464            346            854           2243 
##          KONYA        KUTAHYA        MALATYA         MANISA         MARDIN 
##           2375            637            802           1228            610 
##          MUGLA            MUS       NEVSEHIR          NIGDE           ORDU 
##           2146            348            435            353            763 
##       OSMANIYE           RIZE        SAKARYA         SAMSUN      SANLIURFA 
##            456            510            961           1184            976 
##          SIIRT          SINOP         SIRNAK          SIVAS       TEKIRDAG 
##            298            258            391            995            896 
##          TOKAT        TRABZON        TUNCELI           USAK            VAN 
##            584           1430            181            402            932 
##         YALOVA         YOZGAT      ZONGULDAK 
##            452            481            734
#Replace empty values and "-" by NA
d_base$mx_poparea <- ifelse(d_base$mx_poparea %in% c("", "-") , NA, d_base$mx_poparea)
d_base$mx_poparea <- toupper(d_base$mx_poparea)

General Statistics

Plotting