1.How we load dataset?

To load the dataset, first install the "dslabs" package (for example with `install.packages("dslabs")`), then run the following code:

library(dslabs)

## Warning: package 'dslabs' was built under R version 4.2.3

data("murders")

2.Store the state names in a vector.

# murders$state is already a character vector, so it is stored directly.
state <- murders$state
state

##  [1] "Alabama"              "Alaska"               "Arizona"             
##  [4] "Arkansas"             "California"           "Colorado"            
##  [7] "Connecticut"          "Delaware"             "District of Columbia"
## [10] "Florida"              "Georgia"              "Hawaii"              
## [13] "Idaho"                "Illinois"             "Indiana"             
## [16] "Iowa"                 "Kansas"               "Kentucky"            
## [19] "Louisiana"            "Maine"                "Maryland"            
## [22] "Massachusetts"        "Michigan"             "Minnesota"           
## [25] "Mississippi"          "Missouri"             "Montana"             
## [28] "Nebraska"             "Nevada"               "New Hampshire"       
## [31] "New Jersey"           "New Mexico"           "New York"            
## [34] "North Carolina"       "North Dakota"         "Ohio"                
## [37] "Oklahoma"             "Oregon"               "Pennsylvania"        
## [40] "Rhode Island"         "South Carolina"       "South Dakota"        
## [43] "Tennessee"            "Texas"                "Utah"                
## [46] "Vermont"              "Virginia"             "Washington"          
## [49] "West Virginia"        "Wisconsin"            "Wyoming"

3.Store the state abbreviations in a vector.

# murders$abb is already a character vector, so it is stored directly.
abb <- murders$abb
abb

##  [1] "AL" "AK" "AZ" "AR" "CA" "CO" "CT" "DE" "DC" "FL" "GA" "HI" "ID" "IL" "IN"
## [16] "IA" "KS" "KY" "LA" "ME" "MD" "MA" "MI" "MN" "MS" "MO" "MT" "NE" "NV" "NH"
## [31] "NJ" "NM" "NY" "NC" "ND" "OH" "OK" "OR" "PA" "RI" "SC" "SD" "TN" "TX" "UT"
## [46] "VT" "VA" "WA" "WV" "WI" "WY"

4.What is the total population of USA?

# Add up the population column over every row of the dataset.
total_population_usa <- with(murders, sum(population))
total_population_usa

## [1] 309864228

5.how many murders have been done according to this dataset

# Add up the per-state murder totals.
total_murders <- sum(murders[["total"]])
total_murders

## [1] 9403

6.which state has highest population?

# which.max() gives the row of the largest population; use it to pick the name.
highest_popuation_state <- with(murders, state[which.max(population)])
highest_popuation_state

## [1] "California"

7.which state has highest no of murders?

# State name at the row where the murder total is largest.
highest_no_of_murders <- murders[["state"]][which.max(murders[["total"]])]
highest_no_of_murders

## [1] "California"

8.which state has lowest no of murders?

# which.min() gives the row of the smallest murder total.
lowest_no_of_murders <- with(murders, state[which.min(total)])
lowest_no_of_murders

## [1] "Vermont"

9.which state has lowest no of population?

# State name at the row where the population is smallest.
lowest_no_of_population <- murders[["state"]][which.min(murders[["population"]])]
lowest_no_of_population

## [1] "Wyoming"

10.Display population of Washington?

# Logical subsetting: keep the population entry whose state is Washington.
washington_population <- with(murders, population[state == "Washington"])
cat(washington_population)

## 6724540

11.Display no of murders at Alaska?

# Logical subsetting: keep the murder total whose state is Alaska.
no_of_murders_at_Alaska <- with(murders, total[state == "Alaska"])
no_of_murders_at_Alaska

## [1] 19

12.compute correlation between population and number of murders.

Correlation:

      Correlation is a statistical measure that describes the degree to which two or more variables are related or associated with each other. It quantifies the strength and direction of the linear relationship between these variables.

Formula:

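\[ r = \frac{\sum{(x_i - \bar{x})(y_i - \bar{y})}}{\sqrt{\sum{(x_i - \bar{x})^2} \cdot \sum{(y_i - \bar{y})^2}}} \]

The step-by-step calculation below uses the algebraically equivalent computational form:

\[ r = \frac{n\sum{x_i y_i} - \sum{x_i}\sum{y_i}}{\sqrt{\left(n\sum{x_i^2} - \left(\sum{x_i}\right)^2\right)\left(n\sum{y_i^2} - \left(\sum{y_i}\right)^2\right)}} \]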

Calculation

Create vectors

x<- c(murders$population)
x

##  [1]  4779736   710231  6392017  2915918 37253956  5029196  3574097   897934
##  [9]   601723 19687653  9920000  1360301  1567582 12830632  6483802  3046355
## [17]  2853118  4339367  4533372  1328361  5773552  6547629  9883640  5303925
## [25]  2967297  5988927   989415  1826341  2700551  1316470  8791894  2059179
## [33] 19378102  9535483   672591 11536504  3751351  3831074 12702379  1052567
## [41]  4625364   814180  6346105 25145561  2763885   625741  8001024  6724540
## [49]  1852994  5686986   563626

y<- c(murders$total)
y

##  [1]  135   19  232   93 1257   65   97   38   99  669  376    7   12  364  142
## [16]   21   63  116  351   11  293  118  413   53  120  321   12   32   84    5
## [31]  246   67  517  286    4  310  111   36  457   16  207    8  219  805   22
## [46]    2  250   93   27   97    5

Multiplication of vectors

xy <- x * y
xy

##  [1]   645264360    13494389  1482947944   271180374 46828222692   326897740
##  [7]   346687409    34121492    59570577 13171039857  3729920000     9522107
## [13]    18810984  4670350048   920699884    63973455   179746434   503366572
## [19]  1591213572    14611971  1691650736   772620222  4081943320   281108025
## [25]   356075640  1922445567    11872980    58442912   226846284     6582350
## [31]  2162805924   137964993 10018478734  2727148138     2690364  3576316240
## [37]   416399961   137918664  5804987203    16841072   957450348     6513440
## [43]  1389796995 20242176605    60805470     1251482  2000256000   625382220
## [49]    50030838   551637642     2818130

Sum of xy

sum_xy<-sum(xy)
sum_xy

## [1] 135180900360

Multiply the number of observations (n) by the sum of xy

n<-length(x)
n

## [1] 51

n*(sum_xy)

## [1] 6.894226e+12

Sum x

sum_x<-sum(x)
sum_x

## [1] 309864228

Sum y

sum_y<-sum(y)

sum_y

## [1] 9403

Sum of x * sum of y

sum_x*sum_y

## [1] 2.913653e+12

Numerator

# Numerator of r in computational form: n * sum(xy) - sum(x) * sum(y).
num <- n * sum_xy - sum_x * sum_y
num

## [1] 3.980573e+12

Square of x

x_sq<-x^2
x_sq

##  [1] 2.284588e+13 5.044281e+11 4.085788e+13 8.502578e+12 1.387857e+15
##  [6] 2.529281e+13 1.277417e+13 8.062855e+11 3.620706e+11 3.876037e+14
## [11] 9.840640e+13 1.850419e+12 2.457313e+12 1.646251e+14 4.203969e+13
## [16] 9.280279e+12 8.140282e+12 1.883011e+13 2.055146e+13 1.764543e+12
## [21] 3.333390e+13 4.287145e+13 9.768634e+13 2.813162e+13 8.804851e+12
## [26] 3.586725e+13 9.789420e+11 3.335521e+12 7.292976e+12 1.733093e+12
## [31] 7.729740e+13 4.240218e+12 3.755108e+14 9.092544e+13 4.523787e+11
## [36] 1.330909e+14 1.407263e+13 1.467713e+13 1.613504e+14 1.107897e+12
## [41] 2.139399e+13 6.628891e+11 4.027305e+13 6.322992e+14 7.639060e+12
## [46] 3.915518e+11 6.401639e+13 4.521944e+13 3.433587e+12 3.234181e+13
## [51] 3.176743e+11

Sum of square of x

sum_x_sq<- sum(x_sq)
sum_x_sq

## [1] 4.236103e+15

Multiply n by the sum of squares of x

n*sum_x_sq

## [1] 2.160412e+17

Square of sum x

sq_sum_x<-(sum_x)^2
sq_sum_x

## [1] 9.601584e+16

x part of the denominator: n * (sum of squares of x) - (square of sum of x)

(n*sum_x_sq)-(sq_sum_x)

## [1] 1.200254e+17

Square of y

y_sq<-y^2
y_sq

##  [1]   18225     361   53824    8649 1580049    4225    9409    1444    9801
## [10]  447561  141376      49     144  132496   20164     441    3969   13456
## [19]  123201     121   85849   13924  170569    2809   14400  103041     144
## [28]    1024    7056      25   60516    4489  267289   81796      16   96100
## [37]   12321    1296  208849     256   42849      64   47961  648025     484
## [46]       4   62500    8649     729    9409      25

Sum of square of y

sum_y_sq<-sum(y_sq)
sum_y_sq

## [1] 4521433

Multiply n by the sum of squares of y

n*sum_y_sq

## [1] 230593083

Square of sum of y

sq_sum_y<-(sum_y)^2
sq_sum_y

## [1] 88416409

(n*sum_y_sq)-(sq_sum_y)

## [1] 142176674

Denominator

# Quantity under the square root in r: the x term multiplied by the y term.
deno <- (n * sum_x_sq - sq_sum_x) * (n * sum_y_sq - sq_sum_y)
deno

## [1] 1.706481e+25

sq root of denominator

sq_root_deno<-deno^0.5
sq_root_deno

## [1] 4.130958e+12

r<-num/sq_root_deno
r

## [1] 0.9635956

Alternate way of Correlation

cor(x,y)

## [1] 0.9635956

Interpretation:

 The correlation between population and the number of murders in the murders dataset is about 0.96. This is a strong positive linear relationship: states with larger populations tend to record more murders. Because the murder figures are raw totals rather than rates per capita, this mostly reflects the size of each state.

13.Fit regression line of number of murders on population.

Regression line:

A regression line, also known as a "best-fit line" or a regression equation, is a straight line that represents the relationship between two variables in a statistical analysis, typically in the context of linear regression. It is used to model the relationship between a dependent variable (also known as the response variable) and one or more independent variables (also known as predictor variables or features). The primary goal of a regression line is to provide an estimate of the dependent variable based on the values of the independent variable(s).

\[ Y = a + bX + \epsilon \]

Calculation

Create Vectors

x <- murders$population  # predictor: state population
y <- murders$total       # response: total murders per state

Sum of x (population)

sum_x<-sum(x)
sum_x

## [1] 309864228

Number of observations (n)

n<-length(x)
n

## [1] 51

Sum of y (total murders)

sum_y<-sum(y)
sum_y

## [1] 9403

Number of observations (n)

# The paired sums below need x and y to have the same number of observations.
# x was counted in the earlier step, so count y here and stop if they differ.
stopifnot(length(y) == length(x))
n <- length(y)
n

## [1] 51

Multiply x and y

xy<-x*y
xy

##  [1]   645264360    13494389  1482947944   271180374 46828222692   326897740
##  [7]   346687409    34121492    59570577 13171039857  3729920000     9522107
## [13]    18810984  4670350048   920699884    63973455   179746434   503366572
## [19]  1591213572    14611971  1691650736   772620222  4081943320   281108025
## [25]   356075640  1922445567    11872980    58442912   226846284     6582350
## [31]  2162805924   137964993 10018478734  2727148138     2690364  3576316240
## [37]   416399961   137918664  5804987203    16841072   957450348     6513440
## [43]  1389796995 20242176605    60805470     1251482  2000256000   625382220
## [49]    50030838   551637642     2818130

Sum of xy

sum_xy<-sum(xy)
sum_xy

## [1] 135180900360

Multiply n by the sum of xy

sum(xy)*n

## [1] 6.894226e+12

Sum of x * sum of y

sum(x)*sum(y)

## [1] 2.913653e+12

Numerator

# Numerator of the slope, using the sums already stored above:
# n * sum(xy) - sum(x) * sum(y).
num <- n * sum_xy - sum_x * sum_y
num

## [1] 3.980573e+12

Square of x

sq_x<-x^2
sq_x

##  [1] 2.284588e+13 5.044281e+11 4.085788e+13 8.502578e+12 1.387857e+15
##  [6] 2.529281e+13 1.277417e+13 8.062855e+11 3.620706e+11 3.876037e+14
## [11] 9.840640e+13 1.850419e+12 2.457313e+12 1.646251e+14 4.203969e+13
## [16] 9.280279e+12 8.140282e+12 1.883011e+13 2.055146e+13 1.764543e+12
## [21] 3.333390e+13 4.287145e+13 9.768634e+13 2.813162e+13 8.804851e+12
## [26] 3.586725e+13 9.789420e+11 3.335521e+12 7.292976e+12 1.733093e+12
## [31] 7.729740e+13 4.240218e+12 3.755108e+14 9.092544e+13 4.523787e+11
## [36] 1.330909e+14 1.407263e+13 1.467713e+13 1.613504e+14 1.107897e+12
## [41] 2.139399e+13 6.628891e+11 4.027305e+13 6.322992e+14 7.639060e+12
## [46] 3.915518e+11 6.401639e+13 4.521944e+13 3.433587e+12 3.234181e+13
## [51] 3.176743e+11

Sum of sq of x

sum_sq_x<-sum(sq_x)
sum_sq_x

## [1] 4.236103e+15

Multiply n by the sum of squares of x

n*sum_sq_x

## [1] 2.160412e+17

Square of sum of x

sq_sum_x<-(sum_x)^2
sq_sum_x

## [1] 9.601584e+16

denominator

deno<-(n*sum_sq_x)-(sq_sum_x)
deno

## [1] 1.200254e+17

Regression coefficient

b<-num/deno
b

## [1] 3.316442e-05

Mean of x

# Mean of x from the stored sum and count; mean(x) gives the same value.
mean_x <- sum_x / n
mean_x

## [1] 6075769

mean(x)

## [1] 6075769

Mean of y

mean_y<-sum_y/n

Regression intercept

a<-(mean_y-b*mean_x)
a

## [1] -17.12682

Regression line

# Fitted values from the regression line a + b * x.
# They are stored under their own name so the observed response `y` is not
# overwritten by its predictions.
y_hat <- a + b * x
cat(y_hat)

## 141.3904 6.42758 194.8607 79.57791 1218.379 149.6636 101.406 12.65264 2.828975 635.8028 311.8642 27.98678 34.86113 408.3937 197.9047 83.90378 77.49519 126.7858 133.2198 26.9275 174.3497 200.0215 310.6584 158.7748 81.28187 181.4925 15.68656 43.44272 72.43539 26.53315 274.4513 51.16466 625.5367 299.112 5.179271 365.4747 107.2846 109.9285 404.1402 17.78096 136.2707 9.874988 193.3381 816.8112 74.53583 3.625518 248.2225 205.8887 44.32665 171.4788 1.56551

Alternate method:-

x <- c(murders$population)
y <- c(murders$total)

Fit a linear model

model <- lm(y ~ x)

Extract the intercept (a) and slope (b) coefficients

# Extract the coefficients by name instead of by position, so the values do
# not depend on term order. `[[` also returns plain unnamed numbers.
# The slope is named "x" because the model was fit as lm(y ~ x).
a <- coef(model)[["(Intercept)"]]
b <- coef(model)[["x"]]

Regression Line

cat("Regression Equation: y =", a, "+", b, "x\n")

## Regression Equation: y = -17.12682 + 3.316442e-05 x

Assignment_001

Fakhar Mahmood(DS-5121106)

2023-10-05

1.How we load dataset?

2.Store the state names in a vectors?

3.Store the state abberivation in a vectors?

4.What is the total population of USA?

5.how many murders have been done according to this dataset

6.which state has highest population?

7.which state has highest no of murders?

8.which state has lowest no of murders?

9.which state has lowest no of population?

10.Display population of Washington?

11.Display no of murders at Alaska?

12.compute correlation between population and number of murders.

Correlation:

Formula:

Calculation

Create vectors

Multiplication of vectors

Sum of xy

Length multiply with the multiplications of vectors

Sum x

Sum y

Sum of x * sum of y

Numerator

Square of x

Sum of square of x

Multiply length of sum of sqyare of x

Square of sum x

Num

Square of y

Sum of square of y

Length of sum of y

Square of sum of y

Denominator

sq root of denominator

Alternate way of Correlation

Interpreation:

13.Fit regression line of number of murders on population.

Regression line:

Calculation

Create Vectors

Sum of x (population)

Number of observations (n)

Sum of y (total murders)

Number of observations (n)

Multiply x and y

Sum of xy

Length of sum of xy

Sum of x * sum of y

Numerator

Square of x

Sum of sq of x

Length of sum of square of x

Square of sum of x

denominator

Regression coefficient

Mean of x

Mean of y

Regression intercept

Regression line

Alternate method:-

Fit a linear model

Extract the intercept (a) and slope (b) coefficients

Regression Line