library(readr)
library(stats)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ stringr 1.5.0
## ✔ forcats 1.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.3.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.2
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Load the signal-metrics CSV into a data frame and preview its first rows.
# NOTE(review): the path is absolute, so it presumably resolves only on the
# author's machine -- confirm before re-running elsewhere.
data <- read.csv(
  file = "C:/Users/prase/OneDrive/Documents/STATISTICS/signal_metrics.csv"
)
head(x = data)
## Timestamp Locality Latitude Longitude SignalStrength DataThroughput
## 1 51:30.7 Danapur 25.42617 85.09443 -76.72446 1.105452
## 2 23:56.4 Bankipore 25.59105 85.25081 -77.52335 2.476287
## 3 24:39.7 Ashok Rajpath 25.48233 85.14868 -78.55790 1.031408
## 4 02:26.4 Rajendra Nagar 25.46116 85.23826 -78.77064 1.461008
## 5 32:12.7 Ashok Rajpath 25.61583 85.10455 -77.27129 1.792531
## 6 58:31.2 Rajendra Nagar 25.56698 85.12149 -75.67285 2.572450
## Latency NetworkType BB60C srsRAN BladeRFxA9
## 1 138.9383 LTE -72.50342 -84.97208 -75.12779
## 2 137.6606 LTE -73.45848 -84.77590 -77.94294
## 3 165.4447 LTE -73.88210 -84.76128 -77.21692
## 4 101.6800 LTE -74.04047 -87.27312 -77.86791
## 5 177.4726 LTE -74.08004 -85.93112 -75.57369
## 6 131.5178 LTE -74.66450 -85.16332 -74.51283
# Show the structure of the data frame: column names, types, sample values.
str(data)
## 'data.frame': 12621 obs. of 11 variables:
## $ Timestamp : chr "51:30.7" "23:56.4" "24:39.7" "02:26.4" ...
## $ Locality : chr "Danapur" "Bankipore" "Ashok Rajpath" "Rajendra Nagar" ...
## $ Latitude : num 25.4 25.6 25.5 25.5 25.6 ...
## $ Longitude : num 85.1 85.3 85.1 85.2 85.1 ...
## $ SignalStrength: num -76.7 -77.5 -78.6 -78.8 -77.3 ...
## $ DataThroughput: num 1.11 2.48 1.03 1.46 1.79 ...
## $ Latency : num 139 138 165 102 177 ...
## $ NetworkType : chr "LTE" "LTE" "LTE" "LTE" ...
## $ BB60C : num -72.5 -73.5 -73.9 -74 -74.1 ...
## $ srsRAN : num -85 -84.8 -84.8 -87.3 -85.9 ...
## $ BladeRFxA9 : num -75.1 -77.9 -77.2 -77.9 -75.6 ...
# Per-column summary of the loaded data frame.
summary(data)
## Timestamp Locality Latitude Longitude
## Length:12621 Length:12621 Min. :25.41 Min. :84.96
## Class :character Class :character 1st Qu.:25.52 1st Qu.:85.07
## Mode :character Mode :character Median :25.59 Median :85.14
## Mean :25.59 Mean :85.14
## 3rd Qu.:25.67 3rd Qu.:85.21
## Max. :25.77 Max. :85.32
## SignalStrength DataThroughput Latency NetworkType
## Min. :-116.94 Min. : 1.001 Min. : 10.02 Length:12621
## 1st Qu.: -94.88 1st Qu.: 2.492 1st Qu.: 39.96 Class :character
## Median : -91.41 Median : 6.463 Median : 75.21 Mode :character
## Mean : -91.76 Mean :20.909 Mean : 85.28
## 3rd Qu.: -88.34 3rd Qu.:31.504 3rd Qu.:125.96
## Max. : -74.64 Max. :99.986 Max. :199.99
## BB60C srsRAN BladeRFxA9
## Min. :-115.67 Min. :-124.65 Min. :-119.21
## 1st Qu.: -95.49 1st Qu.:-102.55 1st Qu.: -95.17
## Median : -91.60 Median : -98.96 Median : -91.46
## Mean : -91.77 Mean : -99.26 Mean : -91.77
## 3rd Qu.: -87.79 3rd Qu.: -95.67 3rd Qu.: -88.15
## Max. : -72.50 Max. : -81.32 Max. : -74.51
Explanatory variable: SignalStrength
Response variable: DataThroughput
# Fit a simple linear regression with DataThroughput as the response and
# SignalStrength as the single predictor, then print the fit summary.
model <- lm(DataThroughput ~ SignalStrength, data = data)
summary(model)
##
## Call:
## lm(formula = DataThroughput ~ SignalStrength, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.782 -16.758 -8.026 6.778 100.446
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -195.59727 4.23774 -46.16 <2e-16 ***
## SignalStrength -2.35949 0.04612 -51.16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25.6 on 12619 degrees of freedom
## Multiple R-squared: 0.1718, Adjusted R-squared: 0.1717
## F-statistic: 2618 on 1 and 12619 DF, p-value: < 2.2e-16
# Scatter plot of throughput against signal strength with the fitted
# least-squares line overlaid in blue (no confidence band).
ggplot(data = data, mapping = aes(x = SignalStrength, y = DataThroughput)) +
  geom_point() +
  geom_smooth(formula = y ~ x, method = "lm", colour = "blue", se = FALSE) +
  labs(
    x = "Signal Strength",
    y = "Data Throughput",
    title = "Linear Regression: DataThroughput vs. SignalStrength"
  )
The blue line on the plot represents the linear regression model’s fitted line, showing the trend. Because the estimated slope is negative (-2.36), the line slopes downward: as “SignalStrength” increases, the fitted “DataThroughput” decreases. In this model, higher signal strength is therefore associated with lower, not higher, data transfer speeds.
# Extract the residuals of the fitted model, then draw the first lm
# diagnostic plot and a normal Q-Q plot with its reference line.
# The variable keeps the name `residuals` because later code reads it.
residuals <- resid(model)
plot(model, which = 1L)
qqnorm(y = residuals)
qqline(y = residuals)
In the Q-Q plot, if the points closely follow the diagonal line, it suggests that the residuals are approximately normally distributed, which is a key assumption of linear regression.
# Histogram of the model residuals to inspect the shape of their distribution.
hist(residuals, breaks = 20, main = "Residuals Histogram")
If the histogram is approximately bell-shaped and symmetrical, it indicates that the residuals are normally distributed, which is another indicator of the normality assumption in linear regression.
# Studentized Breusch-Pagan test on the fitted model (checks for
# non-constant residual variance).
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 483.37, df = 1, p-value < 2.2e-16
# Flag potentially influential observations with the rule of thumb
# Cook's distance > 4 / n. Here n is the number of observations actually
# used in the fit (nobs(model)), not the raw row count of `data`; the two
# differ whenever lm() drops rows with missing values.
outliers <- cooks.distance(model) > 4 / nobs(model)
outlier_indices <- which(outliers)
# Look the flagged rows up by row name rather than by position, so the
# selection stays aligned with `data` even if lm() dropped some rows.
head(data[names(outlier_indices), ])
## Timestamp Locality Latitude Longitude SignalStrength DataThroughput
## 10 58:53.6 Kumhrar 25.53426 85.02893 -74.64485 2.049014
## 22 59:56.7 Gardanibagh 25.45433 85.13229 -79.19649 91.712154
## 58 04:12.6 Bankipore 25.61948 85.28110 -78.31335 78.630808
## 68 53:09.9 Gardanibagh 25.69303 85.05566 -83.02354 67.156948
## 88 06:14.1 Exhibition Road 25.52286 85.09268 -83.48749 95.654632
## 99 50:38.1 Danapur 25.59663 85.25939 -77.15572 75.680627
## Latency NetworkType BB60C srsRAN BladeRFxA9
## 10 148.26785 LTE -75.38481 -81.32009 -76.43542
## 22 29.93184 5G -76.25924 -87.54305 -76.45113
## 58 14.98378 5G -78.04155 -83.42197 -78.63307
## 68 45.19269 5G -78.18412 -90.77549 -80.27265
## 88 28.05698 5G -78.62111 -93.17974 -81.91285
## 99 13.07681 5G -78.85122 -83.43214 -77.50387
# Print the model's R-squared, taken from the summary object.
cat("R-squared:", summary(model)$r.squared, "\n")
## R-squared: 0.1718063
# Print the model's adjusted R-squared, taken from the summary object.
cat("Adjusted R-squared:", summary(model)$adj.r.squared, "\n")
## Adjusted R-squared: 0.1717406
By performing these diagnostic procedures, we can identify potential issues with the model. Common issues include heteroscedasticity, non-normality of residuals, multicollinearity (relevant only when a model has more than one predictor, so it does not apply to this single-predictor model), and outliers. Depending on the issues identified, we may need to consider model refinement, transformation of variables, or data preprocessing to improve the model’s performance.
# Print the model summary again for the coefficient interpretation below.
summary(model)
##
## Call:
## lm(formula = DataThroughput ~ SignalStrength, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.782 -16.758 -8.026 6.778 100.446
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -195.59727 4.23774 -46.16 <2e-16 ***
## SignalStrength -2.35949 0.04612 -51.16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25.6 on 12619 degrees of freedom
## Multiple R-squared: 0.1718, Adjusted R-squared: 0.1717
## F-statistic: 2618 on 1 and 12619 DF, p-value: < 2.2e-16
The coefficient for “SignalStrength” is -2.35949. This means that for each one-unit increase in “SignalStrength,” the “DataThroughput” is expected to decrease by approximately 2.36 units. Because “SignalStrength” is the only predictor in this model, there are no other factors being held constant.
Additionally, the t-value of -51.16 and the associated p-value (< 2e-16, far below the conventional 0.05 significance level) indicate that the coefficient for “SignalStrength” is statistically significant. This suggests that there is a significant relationship between “SignalStrength” and “DataThroughput” in the context of the model.
The residual plot shows the spread of residuals around zero. If the residuals are randomly scattered with no apparent pattern, it suggests that the linear regression assumptions are met.
The normality of residuals is assessed through a QQ plot and histogram. Deviations from a normal distribution may indicate issues with model assumptions.
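A scale-location plot shows the spread of residuals across fitted values. Homoscedasticity means that the variance of residuals is constant across the range of fitted values. Here, the Breusch-Pagan test (BP = 483.37, p-value < 2.2e-16) rejects constant variance, so the residuals are heteroscedastic.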
Diagnostic plots, Cook’s distance, and leverage plots help identify potential outliers and influential data points.
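R-squared represents the proportion of variance explained by the model. A high R-squared indicates a good fit. Here, R-squared is about 0.17, so the model explains only about 17% of the variance in “DataThroughput.”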
Model assumptions, including linearity, independence of errors, and constant variance, should be assessed.