Include libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
library("xlsx")
library("xlsxjars")
## Loading required package: rJava
Import Dataset
#Load dataset
file_path = "/Users/saraawad/Desktop/base_stations.csv";
d_base <- read.table(file=file_path, sep=",", stringsAsFactors = FALSE,
header=TRUE)
d_base$X <- NULL
Data Cleaning
#Convert column names to lower case
names(d_base) <- tolower(names(d_base))
str(d_base)
## 'data.frame': 103005 obs. of 10 variables:
## $ bts_id : int 5074914 5055395 5064215 5064217 5070617 5056817 5066712 5066401 5064176 5063407 ...
## $ mx_lat1 : int 41 39 37 37 39 39 37 40 40 40 ...
## $ mx_lat2 : int 29 33 56 54 54 12 57 46 45 45 ...
## $ mx_lat3 : num 23.5 17.9 24.5 38.9 51.4 ...
## $ mx_long1 : int 32 27 32 32 28 28 34 29 29 30 ...
## $ mx_long2 : int 5 41 29 31 9 5 41 46 53 1 ...
## $ mx_long3 : num 38.7 31.9 45.4 47.8 24.5 ...
## $ mx_sahail : chr "BARTIN" "BALIKESIR" "KONYA" "KONYA" ...
## $ mx_silce : chr "CAYCUMA" "ALTIEYLUL" "SELCUKLU" "KARATAY" ...
## $ mx_poparea: chr "" "RURAL" "SUB_URBAN" "INDUSTRIAL" ...
#Check for missing "NA" values in each column in this dataframe
colSums(is.na(d_base))
## bts_id mx_lat1 mx_lat2 mx_lat3 mx_long1 mx_long2
## 0 3880 3744 3646 3883 3743
## mx_long3 mx_sahail mx_silce mx_poparea
## 3650 0 0 0
#Count empty fields
sum(d_base$mx_sahail == "")
## [1] 1
sum(d_base$mx_silce == "")
## [1] 3728
#Columns such as mx_sahail, mx_silce have empty data
#Get unique length of all columns
apply(d_base, 2, function(x) length(unique(x)))
## bts_id mx_lat1 mx_lat2 mx_lat3 mx_long1 mx_long2
## 103005 19 62 5939 31 62
## mx_long3 mx_sahail mx_silce mx_poparea
## 5934 84 1022 238
#Check if number of rows equal to number of base station ids to make sure no duplicates
nrow(d_base) == unique(length(d_base$bts_id))
## [1] TRUE
#Convert all digits to NA since we are not concerned with digits
d_base$mx_sahail <- ifelse(grepl("[[:digit:]]", d_base$mx_sahail) == TRUE, NA, d_base$mx_sahail)
#Get count for each column mx_sahail
table(d_base$mx_sahail)
##
## ADANA ADIYAMAN AFYON AFYONKARAHISAR
## 1 2200 488 695 107
## AGRI AKSARAY AMASYA ANKARA ANTALYA
## 565 418 350 10636 4105
## ARDAHAN ARTVIN AYDIN BALIKESIR BARTIN
## 199 391 1191 1484 253
## BATMAN BAYBURT BILECIK BINGOL BITLIS
## 432 164 371 342 346
## BOLU BURDUR BURSA CANAKKALE CANKIRI
## 565 465 3200 797 327
## CORUM DENIZLI DIYARBAKIR DUZCE EDIRNE
## 617 1128 1296 384 543
## ELAZIG ERZINCAN ERZURUM ESKISEHIR GAZIANTEP
## 609 572 1306 1098 1503
## GIRESUN GUMUSHANE HAKKARI HATAY ICEL
## 629 311 313 1065 1729
## IGDIR ISPARTA ISTANBUL IZMIR KARABUK
## 244 594 23131 6418 332
## KARAMAN KARS KASTAMONU KAYSERI KILIS
## 352 460 520 1503 177
## KIRIKKALE KIRKLARELI KIRSEHIR KMARAS KOCAELI
## 353 464 346 854 2243
## KONYA KUTAHYA MALATYA MANISA MARDIN
## 2375 637 802 1228 610
## MUGLA MUS NEVSEHIR NIGDE ORDU
## 2146 348 435 353 763
## OSMANIYE RIZE SAKARYA SAMSUN SANLIURFA
## 456 510 961 1184 976
## SIIRT SINOP SIRNAK SIVAS TEKIRDAG
## 298 258 391 995 896
## TOKAT TRABZON TUNCELI USAK VAN
## 584 1430 181 402 932
## YALOVA YOZGAT ZONGULDAK
## 452 481 734
#Replace empty values and "-" by NA
d_base$mx_poparea <- ifelse(d_base$mx_poparea %in% c("", "-") , NA, d_base$mx_poparea)
d_base$mx_poparea <- toupper(d_base$mx_poparea)
General Statistics
Plotting