R Markdown

Importing libraries

library(readr)
library(stats)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ stringr   1.5.0
## ✔ forcats   1.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-8

Load and Explore Data

# Read the signal-metrics CSV into a data frame and preview the first rows.
# NOTE(review): the path is absolute and machine-specific; the report will
# only knit where this file exists at this location.
data <- read.csv(
  file = "C:/Users/prase/OneDrive/Documents/STATISTICS/signal_metrics.csv"
)
head(data, n = 6L)
##   Timestamp       Locality Latitude Longitude SignalStrength DataThroughput
## 1   51:30.7        Danapur 25.42617  85.09443      -76.72446       1.105452
## 2   23:56.4      Bankipore 25.59105  85.25081      -77.52335       2.476287
## 3   24:39.7  Ashok Rajpath 25.48233  85.14868      -78.55790       1.031408
## 4   02:26.4 Rajendra Nagar 25.46116  85.23826      -78.77064       1.461008
## 5   32:12.7  Ashok Rajpath 25.61583  85.10455      -77.27129       1.792531
## 6   58:31.2 Rajendra Nagar 25.56698  85.12149      -75.67285       2.572450
##    Latency NetworkType     BB60C    srsRAN BladeRFxA9
## 1 138.9383         LTE -72.50342 -84.97208  -75.12779
## 2 137.6606         LTE -73.45848 -84.77590  -77.94294
## 3 165.4447         LTE -73.88210 -84.76128  -77.21692
## 4 101.6800         LTE -74.04047 -87.27312  -77.86791
## 5 177.4726         LTE -74.08004 -85.93112  -75.57369
## 6 131.5178         LTE -74.66450 -85.16332  -74.51283
str(data)
## 'data.frame':    12621 obs. of  11 variables:
##  $ Timestamp     : chr  "51:30.7" "23:56.4" "24:39.7" "02:26.4" ...
##  $ Locality      : chr  "Danapur" "Bankipore" "Ashok Rajpath" "Rajendra Nagar" ...
##  $ Latitude      : num  25.4 25.6 25.5 25.5 25.6 ...
##  $ Longitude     : num  85.1 85.3 85.1 85.2 85.1 ...
##  $ SignalStrength: num  -76.7 -77.5 -78.6 -78.8 -77.3 ...
##  $ DataThroughput: num  1.11 2.48 1.03 1.46 1.79 ...
##  $ Latency       : num  139 138 165 102 177 ...
##  $ NetworkType   : chr  "LTE" "LTE" "LTE" "LTE" ...
##  $ BB60C         : num  -72.5 -73.5 -73.9 -74 -74.1 ...
##  $ srsRAN        : num  -85 -84.8 -84.8 -87.3 -85.9 ...
##  $ BladeRFxA9    : num  -75.1 -77.9 -77.2 -77.9 -75.6 ...
summary(data)
##   Timestamp           Locality            Latitude       Longitude    
##  Length:12621       Length:12621       Min.   :25.41   Min.   :84.96  
##  Class :character   Class :character   1st Qu.:25.52   1st Qu.:85.07  
##  Mode  :character   Mode  :character   Median :25.59   Median :85.14  
##                                        Mean   :25.59   Mean   :85.14  
##                                        3rd Qu.:25.67   3rd Qu.:85.21  
##                                        Max.   :25.77   Max.   :85.32  
##  SignalStrength    DataThroughput      Latency       NetworkType       
##  Min.   :-116.94   Min.   : 1.001   Min.   : 10.02   Length:12621      
##  1st Qu.: -94.88   1st Qu.: 2.492   1st Qu.: 39.96   Class :character  
##  Median : -91.41   Median : 6.463   Median : 75.21   Mode  :character  
##  Mean   : -91.76   Mean   :20.909   Mean   : 85.28                     
##  3rd Qu.: -88.34   3rd Qu.:31.504   3rd Qu.:125.96                     
##  Max.   : -74.64   Max.   :99.986   Max.   :199.99                     
##      BB60C             srsRAN          BladeRFxA9     
##  Min.   :-115.67   Min.   :-124.65   Min.   :-119.21  
##  1st Qu.: -95.49   1st Qu.:-102.55   1st Qu.: -95.17  
##  Median : -91.60   Median : -98.96   Median : -91.46  
##  Mean   : -91.77   Mean   : -99.26   Mean   : -91.77  
##  3rd Qu.: -87.79   3rd Qu.: -95.67   3rd Qu.: -88.15  
##  Max.   : -72.50   Max.   : -81.32   Max.   : -74.51

Select a binary column

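I created a binary column “GoodSignal” from the “SignalStrength” column using a threshold of -91, which is close to the median signal strength (-91.41): “GoodSignal” is 1 when “SignalStrength” is greater than -91 and 0 otherwise.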

summary(data$SignalStrength)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -116.94  -94.88  -91.41  -91.76  -88.34  -74.64
# Cut-off used to split readings into good (1) and not good (0).
threshold <- -91

# A logical comparison converted to numeric yields the same 1/0 coding
# (and NA where SignalStrength is NA).
data[["GoodSignal"]] <- as.numeric(data[["SignalStrength"]] > threshold)

# Show the first 50 rows, including the new column.
head(data, n = 50)
##    Timestamp          Locality Latitude Longitude SignalStrength DataThroughput
## 1    51:30.7           Danapur 25.42617  85.09443      -76.72446       1.105452
## 2    23:56.4         Bankipore 25.59105  85.25081      -77.52335       2.476287
## 3    24:39.7     Ashok Rajpath 25.48233  85.14868      -78.55790       1.031408
## 4    02:26.4    Rajendra Nagar 25.46116  85.23826      -78.77064       1.461008
## 5    32:12.7     Ashok Rajpath 25.61583  85.10455      -77.27129       1.792531
## 6    58:31.2    Rajendra Nagar 25.56698  85.12149      -75.67285       2.572450
## 7    05:10.6          Anisabad 25.64325  85.22548      -79.78586       7.055229
## 8    25:02.5       Fraser Road 25.50409  85.15215      -79.56979       2.990245
## 9    01:41.0         Anandpuri 25.71732  85.00773      -79.62299       2.714386
## 10   58:53.6           Kumhrar 25.53426  85.02893      -74.64485       2.049014
## 11   48:14.1       Pataliputra 25.73458  85.24577      -79.41169       1.963675
## 12   19:58.4         Bankipore 25.49777  85.24539      -78.54243       4.803491
## 13   43:07.5          Anisabad 25.46295  85.09027      -78.54834       5.444352
## 14   54:45.6    Rajendra Nagar 25.58205  85.14008      -76.37918       1.487846
## 15   55:11.4       Gardanibagh 25.53767  85.21657      -79.47539       2.494880
## 16   08:51.1       Bailey Road 25.45919  85.13344      -78.70136       4.036939
## 17   19:51.3   Exhibition Road 25.73730  85.20469      -80.37612       5.560403
## 18   45:27.0     Gandhi Maidan 25.43104  85.15088      -80.45239       7.751439
## 19   44:46.5           Danapur 25.62924  85.25111      -79.63338       7.567241
## 20   55:23.6 Boring Canal Road 25.66722  85.18380      -79.52348       4.408633
## 21   08:32.9 Patliputra Colony 25.62467  85.24504      -79.75668       2.885384
## 22   59:56.7       Gardanibagh 25.45433  85.13229      -79.19649      91.712154
## 23   58:38.5         Bankipore 25.46229  85.23099      -78.74127       5.053948
## 24   31:13.9           Kumhrar 25.69694  85.00577      -80.39014       2.647859
## 25   11:33.0        Kankarbagh 25.53632  84.98680      -81.04179       6.567972
## 26   34:44.0       Boring Road 25.53574  85.29738      -79.08490       1.771200
## 27   38:36.7   Exhibition Road 25.47124  85.07854      -79.05149      17.014677
## 28   28:24.4        Kankarbagh 25.58727  85.09570      -81.22762       6.749505
## 29   04:22.7           Danapur 25.53218  85.28254      -81.60143       2.093142
## 30   25:58.1           Danapur 25.53095  85.08731      -80.06807       5.919178
## 31   56:44.3 Boring Canal Road 25.74663  85.14328      -81.65836       4.149295
## 32   49:29.8        Kankarbagh 25.63683  85.25327      -80.33459       6.335099
## 33   11:10.4     Ashok Rajpath 25.59772  85.27217      -81.66899       1.616805
## 34   43:05.4         Bankipore 25.58512  85.12152      -81.10474       5.736821
## 35   09:46.1   Phulwari Sharif 25.56541  85.01148      -80.74996       9.245613
## 36   37:45.9       Fraser Road 25.70434  85.04780      -81.66115       8.389815
## 37   32:55.4   Phulwari Sharif 25.73233  85.03220      -80.99253       1.685927
## 38   42:34.9   Exhibition Road 25.44588  85.07742      -81.54229       9.877065
## 39   17:44.8           Kumhrar 25.50266  84.99563      -80.69694       1.392717
## 40   40:22.8           Danapur 25.47281  85.08097      -82.08078       2.846367
## 41   48:26.2         Bankipore 25.58000  85.11384      -80.77411       9.607038
## 42   37:23.5   Exhibition Road 25.75129  85.18701      -82.42370       1.948491
## 43   14:34.8           Kumhrar 25.61121  85.04688      -79.85168       2.188680
## 44   33:18.1 Patliputra Colony 25.74947  85.21585      -82.46717       2.131396
## 45   57:22.4       Fraser Road 25.51345  85.28793      -82.00365       1.418214
## 46   27:04.0     Gandhi Maidan 25.76242  85.17402      -80.90676       1.379695
## 47   39:12.7       Boring Road 25.57070  85.25366      -81.78015       1.233512
## 48   19:28.1           Danapur 25.71364  85.17703      -80.59119       2.550639
## 49   16:41.4         Anandpuri 25.57993  85.19552      -82.00665       9.601905
## 50   04:27.5 Patliputra Colony 25.74304  85.07968      -79.10144       5.409707
##      Latency NetworkType     BB60C    srsRAN BladeRFxA9 GoodSignal
## 1  138.93828         LTE -72.50342 -84.97208  -75.12779          1
## 2  137.66062         LTE -73.45848 -84.77590  -77.94294          1
## 3  165.44468         LTE -73.88210 -84.76128  -77.21692          1
## 4  101.68002         LTE -74.04047 -87.27312  -77.86791          1
## 5  177.47257         LTE -74.08004 -85.93112  -75.57369          1
## 6  131.51783         LTE -74.66450 -85.16332  -74.51283          1
## 7   80.06226          4G -74.87395 -88.32040  -80.94109          1
## 8  196.70610         LTE -75.20550 -84.81174  -77.65664          1
## 9  176.08350         LTE -75.24133 -86.50868  -81.01483          1
## 10 148.26785         LTE -75.38481 -81.32009  -76.43542          1
## 11 173.61780         LTE -75.55510 -87.23246  -79.51853          1
## 12  66.28832          4G -75.57373 -84.19625  -79.59106          1
## 13  95.61426          4G -75.58124 -86.49970  -77.42733          1
## 14 127.23123         LTE -75.70932 -81.98091  -76.68469          1
## 15 173.12382         LTE -75.77120 -84.70266  -76.48276          1
## 16  73.33284          4G -75.79857 -85.57995  -79.23622          1
## 17  51.98722          4G -75.87241 -89.24276  -82.59162          1
## 18  75.80653          4G -75.95761 -89.55060  -80.25780          1
## 19  93.82124          4G -76.01485 -86.44516  -80.37899          1
## 20  82.72454          4G -76.13552 -88.52299  -82.17284          1
## 21 147.21622         LTE -76.18193 -85.50319  -77.43203          1
## 22  29.93184          5G -76.25924 -87.54305  -76.45113          1
## 23  61.02337          4G -76.38739 -88.44806  -75.90696          1
## 24 164.41264         LTE -76.45485 -90.35892  -80.98320          1
## 25  65.02414          4G -76.63643 -86.67732  -79.91372          1
## 26 182.29193         LTE -76.66324 -88.82760  -80.88318          1
## 27  18.02683          5G -76.75894 -84.10311  -76.95299          1
## 28  72.99390          4G -76.75943 -89.33791  -81.28751          1
## 29 182.09215         LTE -76.80326 -86.81165  -79.96883          1
## 30  95.08057          4G -76.85047 -88.66085  -78.15180          1
## 31  56.17318          4G -76.85784 -89.68976  -83.29383          1
## 32  92.19727          4G -76.89185 -87.44430  -82.59454          1
## 33 115.07861         LTE -76.93213 -87.77232  -79.90982          1
## 34  53.20048          4G -76.97834 -88.49144  -83.97862          1
## 35  60.52121          4G -76.99054 -86.07749  -83.21303          1
## 36  55.19797          4G -77.03750 -87.21004  -83.03906          1
## 37 159.03922         LTE -77.11803 -90.62727  -82.10897          1
## 38  89.30251          4G -77.23280 -89.18233  -81.69675          1
## 39 163.97185         LTE -77.34661 -87.81753  -78.85530          1
## 40 173.38548         LTE -77.46782 -89.83772  -81.91104          1
## 41  97.14400          4G -77.47208 -89.44943  -78.96597          1
## 42 166.91099         LTE -77.49886 -87.98487  -81.11721          1
## 43 143.98176         LTE -77.52306 -86.76499  -81.02427          1
## 44 197.11201         LTE -77.52908 -92.38100  -82.37504          1
## 45 114.95091         LTE -77.60146 -87.09757  -81.71829          1
## 46 135.31408         LTE -77.62841 -90.30810  -83.89171          1
## 47 110.18800         LTE -77.66142 -88.60098  -81.34927          1
## 48 189.91804         LTE -77.70028 -87.05853  -79.85920          1
## 49  88.40060          4G -77.72962 -89.63120  -84.01149          1
## 50  52.75487          4G -77.77540 -84.31198  -78.62352          1
# Tabulate how many rows fall in each GoodSignal class and display the table.
count <- table(data[["GoodSignal"]])
print(count)
## 
##    0    1 
## 6722 5899

Build a logistic regression model

Explanatory Variable: DataThroughput

# Logistic regression (binomial family, logit link) of the binary
# GoodSignal flag on DataThroughput, followed by the coefficient summary.
model <- glm(
  formula = GoodSignal ~ DataThroughput,
  family = binomial(link = "logit"),
  data = data
)
summary(model)
## 
## Call:
## glm(formula = GoodSignal ~ DataThroughput, family = binomial(link = "logit"), 
##     data = data)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     0.3841369  0.0228389   16.82   <2e-16 ***
## DataThroughput -0.0276248  0.0008373  -32.99   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 17443  on 12620  degrees of freedom
## Residual deviance: 16021  on 12619  degrees of freedom
## AIC: 16025
## 
## Number of Fisher Scoring iterations: 3

Interpretation of the coefficients

The intercept represents the log-odds when “DataThroughput” is zero, and the coefficient for “DataThroughput” indicates the change in log-odds for a one-unit increase in “DataThroughput.”

Intercept:

Estimate: 0.3841369, Std. Error: 0.0228389, Z value: 16.82, Pr(>|z|): <2e-16

The intercept represents the log-odds of the “GoodSignal” when “DataThroughput” is zero. In other words, when “DataThroughput” is zero, the estimated log-odds of having a “GoodSignal” are 0.3841369.

Coefficient for DataThroughput:

Estimate: -0.0276248, Std. Error: 0.0008373 Z value: -32.99, Pr(>|z|): <2e-16

The coefficient for “DataThroughput” represents how a one-unit increase in “DataThroughput” affects the log-odds of having a “GoodSignal” (it is the only predictor in this model, so there are no other variables to hold constant). In this case, for each one-unit increase in “DataThroughput,” the log-odds of having a “GoodSignal” decrease by 0.0276248; equivalently, the odds are multiplied by exp(-0.0276248) ≈ 0.973, a drop of about 2.7% per unit.

Calculating Confidence Interval

# Interval for the DataThroughput slope from confint(); the bounds printed
# below are labelled 2.5 % and 97.5 %, and the console message shows that
# confint() profiles the fitted model to obtain them.
conf_interval <- confint(model, parm = "DataThroughput")
## Waiting for profiling to be done...
conf_interval
##       2.5 %      97.5 % 
## -0.02928140 -0.02599818
# NOTE(review): same call with level = 0.95 written out, so it presumably
# repeats the interval above; `ci` is not referenced again in the visible
# code -- confirm it is needed.
ci <- confint(model, parm = "DataThroughput", level = 0.95)
## Waiting for profiling to be done...
# Hand-computed interval for the slope, built from the fitted model.
coef_DataThroughput <- coef(model)["DataThroughput"]

# Standard error: square root of the slope's entry on the diagonal of the
# coefficient covariance matrix (names dropped to leave a bare number).
se_DataThroughput <- unname(sqrt(diag(vcov(model))["DataThroughput"]))

# Bounds are the estimate minus / plus 1.96 standard errors.
lower_bound <- coef_DataThroughput - 1.96 * se_DataThroughput
upper_bound <- coef_DataThroughput + 1.96 * se_DataThroughput

# Print the slope estimate, its standard error, and the hand-computed
# (estimate -/+ 1.96 * SE) interval bounds, one line each.
cat("Coefficient (DataThroughput):", coef_DataThroughput, "\n")
## Coefficient (DataThroughput): -0.02762481
cat("Standard Error (SE):", se_DataThroughput, "\n")
## Standard Error (SE): 0.0008373393
cat("95% Confidence Interval (CI): [", lower_bound, ",", upper_bound, "]\n")
## 95% Confidence Interval (CI): [ -0.029266 , -0.02598363 ]
  • The coefficient estimate provides insight into the strength and direction of the relationship between “DataThroughput” and the probability of a “GoodSignal.”

  • The standard error quantifies the uncertainty associated with the coefficient estimate.

  • The 95% confidence interval offers a range of values where the true coefficient is likely to lie, with a 95% level of confidence.

# Vertically jittered 0/1 outcomes against throughput, overlaid with a
# straight-line (lm) trend and no confidence band.
ggplot(data = data, mapping = aes(x = DataThroughput, y = GoodSignal)) +
  geom_jitter(shape = "O", size = 3, width = 0, height = 0.1) +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Transformation for any explanatory variable

# Scatterplot of the 0/1 GoodSignal outcome against DataThroughput with a
# quadratic logistic-regression curve.
#
# geom_smooth(method = "glm") fits a Gaussian model unless a family is
# supplied through method.args, which would draw a quadratic least-squares
# curve rather than a logistic one. The binomial family is therefore passed
# explicitly so the smoother matches the logistic model used in this report.
ggplot(data, aes(x = DataThroughput, y = GoodSignal)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    formula = y ~ poly(x, 2),
    method.args = list(family = binomial(link = "logit")),
    se = FALSE
  ) +
  labs(title = "Scatterplot of DataThroughput vs. GoodSignal",
       x = "DataThroughput",
       y = "GoodSignal") +
  theme_minimal()

In this transformed plot, the logistic regression line is a curve that fits the data more closely. This transformation allows the model to capture the potential nonlinear relationship between “DataThroughput” and the probability of a “GoodSignal.” It provides a better representation of the underlying trend in the data and can lead to a more accurate model.

# Add a natural-log copy of the predictor.
data <- mutate(data, log_DataThroughput = log(DataThroughput))

# Straight-line fit of the 0/1 outcome on the logged predictor; only its
# R-squared is used, for the plot subtitle.
# NOTE(review): this rebinds `model`, so the logistic fit created earlier is
# no longer reachable under that name after this point.
model <- lm(GoodSignal ~ log_DataThroughput, data = data)
rsquared <- summary(model)$r.squared

# Scatterplot on the log scale with the dashed linear trend.
ggplot(data, aes(x = log_DataThroughput, y = GoodSignal)) +
  geom_point() +
  geom_smooth(method = "lm", color = "gray", linetype = "dashed", se = FALSE) +
  labs(
    title = "Relationship between log_DataThroughput and GoodSignal",
    subtitle = paste("Linear Fit R-Squared =", round(rsquared, 3))
  ) +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'

In our model, the negative coefficient for “DataThroughput” (-0.0276) suggests that higher data throughput is associated with a decreased likelihood of a “GoodSignal.” The polynomial transformation allows the model to capture the curvature in the data, providing a more accurate representation of the underlying trend. Confidence intervals are vital for assessing the precision and significance of a coefficient estimate. If the interval does not include zero, as is the case here, it suggests that the variable is statistically significant in predicting the outcome.