# Load the CigarettesB data set from the CSV file hosted on GitHub.
path <- "https://raw.githubusercontent.com/karmaggyatso/CUNY_SPS/main/week3/CigarettesB.csv"
cigData <- read.csv(path)
# Fail early if the download did not yield the three columns that the rest of
# this script reads; otherwise the statistics below would quietly come back as
# NULL/NA instead of pointing at the real problem.
stopifnot(all(c("packs", "price", "income") %in% names(cigData)))
# 1. Data exploration: summary statistics, means, medians, quartiles, or any
#    other relevant information about the data set. The conclusions drawn from
#    these numbers belong in the R Markdown text.
summary(cigData)
## X packs price income
## Length:46 Min. :4.409 Min. :-0.0326 Min. :4.529
## Class :character 1st Qu.:4.712 1st Qu.: 0.1405 1st Qu.:4.679
## Mode :character Median :4.815 Median : 0.2002 Median :4.759
## Mean :4.848 Mean : 0.2055 Mean :4.775
## 3rd Qu.:4.984 3rd Qu.: 0.2735 3rd Qu.:4.853
## Max. :5.379 Max. : 0.3640 Max. :5.103

# ---- packs ----
# packs: mean
mean(cigData[["packs"]])
## [1] 4.847844
# packs: median
median(cigData[["packs"]])
## [1] 4.81495
# packs: standard deviation
sd(cigData[["packs"]])
## [1] 0.1914581
# packs: range (minimum, maximum)
range(cigData[["packs"]])
## [1] 4.40859 5.37906

# ---- price ----
# price: mean
mean(cigData[["price"]])
## [1] 0.2055087
# price: median
median(cigData[["price"]])
## [1] 0.200205
# price: standard deviation
sd(cigData[["price"]])
## [1] 0.08623038
# price: range (minimum, maximum)
range(cigData[["price"]])
## [1] -0.03260 0.36399

# ---- income ----
# income: mean
mean(cigData[["income"]])
## [1] 4.775455
# income: median
median(cigData[["income"]])
## [1] 4.758505
# income: standard deviation
sd(cigData[["income"]])
## [1] 0.1422612
# income: range (minimum, maximum)
range(cigData[["income"]])
## [1] 4.52938 5.10268
# Collect the three numeric variables into their own data frame. The columns
# are named "cigData.<variable>" at this stage, matching the header of the
# printed table; they are given descriptive names further down the script.
dataDf <- data.frame(
  cigData.packs = cigData[["packs"]],
  cigData.price = cigData[["price"]],
  cigData.income = cigData[["income"]]
)
# Show the full table.
print(dataDf)
## cigData.packs cigData.price cigData.income
## 1 4.96213 0.20487 4.64039
## 2 4.66312 0.16640 4.68389
## 3 5.10709 0.23406 4.59435
## 4 4.50449 0.36399 4.88147
## 5 4.66983 0.32149 5.09472
## 6 5.04705 0.21929 4.87087
## 7 4.65637 0.28946 5.05960
## 8 4.80081 0.28733 4.81155
## 9 4.97974 0.12826 4.73299
## 10 4.74902 0.17541 4.64307
## 11 4.81445 0.24806 4.90387
## 12 5.11129 0.08992 4.72916
## 13 4.80857 0.24081 4.74211
## 14 4.79263 0.21642 4.79613
## 15 5.37906 -0.03260 4.64937
## 16 4.98602 0.23856 4.61461
## 17 4.98722 0.29106 4.75501
## 18 4.77751 0.12575 4.94692
## 19 4.73877 0.22613 4.99998
## 20 4.94744 0.23067 4.80620
## 21 4.69589 0.34297 4.81207
## 22 4.93990 0.13638 4.52938
## 23 5.06430 0.08731 4.78189
## 24 4.73313 0.15303 4.70417
## 25 4.77558 0.18907 4.79671
## 26 4.96642 0.32304 4.83816
## 27 5.10990 0.15852 5.00319
## 28 4.70633 0.30901 5.10268
## 29 4.58107 0.16458 4.58202
## 30 4.66496 0.34701 4.96075
## 31 4.58237 0.18197 4.69163
## 32 4.97952 0.12889 4.75875
## 33 4.72720 0.19554 4.62730
## 34 4.80363 0.22784 4.83516
## 35 4.84693 0.30324 4.84670
## 36 5.07801 0.07944 4.62549
## 37 4.81545 0.13139 4.67747
## 38 5.04939 0.15547 4.72525
## 39 4.65398 0.28196 4.73437
## 40 4.40859 0.19260 4.55586
## 41 5.08799 0.18018 4.77578
## 42 4.93065 0.11818 4.85490
## 43 4.66134 0.35053 4.85645
## 44 4.82454 0.12008 4.56859
## 45 4.83026 0.22954 4.75826
## 46 5.00087 0.10029 4.71169
# Give the three columns descriptive names. The assignment is positional, so it
# relies on dataDf holding packs, price and income in exactly that order.
colnames(dataDf) <- c(
  "total_number_of_cig_packs",
  "price_per_pack",
  "grossIncome"
)
# Total spend on cigarettes: number of packs times price per pack.
# NOTE(review): the AER package documents the CigarettesB variables packs,
# price and income as logarithms (and price takes a negative value in this
# data). If this CSV is that data set, the product below is not a monetary
# amount -- confirm before interpreting totalCost or netIncome.
dataDf$totalCost <- dataDf$total_number_of_cig_packs * dataDf$price_per_pack
# Income left over after the cigarette spend. The difference is kept signed:
# wrapping it in abs() would report a shortfall as if it were positive income.
dataDf$netIncome <- dataDf$grossIncome - dataDf$totalCost
dataDf
## total_number_of_cig_packs price_per_pack grossIncome totalCost netIncome
## 1 4.96213 0.20487 4.64039 1.0165916 3.623798
## 2 4.66312 0.16640 4.68389 0.7759432 3.907947
## 3 5.10709 0.23406 4.59435 1.1953655 3.398985
## 4 4.50449 0.36399 4.88147 1.6395893 3.241881
## 5 4.66983 0.32149 5.09472 1.5013036 3.593416
## 6 5.04705 0.21929 4.87087 1.1067676 3.764102
## 7 4.65637 0.28946 5.05960 1.3478329 3.711767
## 8 4.80081 0.28733 4.81155 1.3794167 3.432133
## 9 4.97974 0.12826 4.73299 0.6387015 4.094289
## 10 4.74902 0.17541 4.64307 0.8330256 3.810044
## 11 4.81445 0.24806 4.90387 1.1942725 3.709598
## 12 5.11129 0.08992 4.72916 0.4596072 4.269553
## 13 4.80857 0.24081 4.74211 1.1579517 3.584158
## 14 4.79263 0.21642 4.79613 1.0372210 3.758909
## 15 5.37906 -0.03260 4.64937 -0.1753574 4.824727
## 16 4.98602 0.23856 4.61461 1.1894649 3.425145
## 17 4.98722 0.29106 4.75501 1.4515803 3.303430
## 18 4.77751 0.12575 4.94692 0.6007719 4.346148
## 19 4.73877 0.22613 4.99998 1.0715781 3.928402
## 20 4.94744 0.23067 4.80620 1.1412260 3.664974
## 21 4.69589 0.34297 4.81207 1.6105494 3.201521
## 22 4.93990 0.13638 4.52938 0.6737036 3.855676
## 23 5.06430 0.08731 4.78189 0.4421640 4.339726
## 24 4.73313 0.15303 4.70417 0.7243109 3.979859
## 25 4.77558 0.18907 4.79671 0.9029189 3.893791
## 26 4.96642 0.32304 4.83816 1.6043523 3.233808
## 27 5.10990 0.15852 5.00319 0.8100213 4.193169
## 28 4.70633 0.30901 5.10268 1.4543030 3.648377
## 29 4.58107 0.16458 4.58202 0.7539525 3.828067
## 30 4.66496 0.34701 4.96075 1.6187878 3.341962
## 31 4.58237 0.18197 4.69163 0.8338539 3.857776
## 32 4.97952 0.12889 4.75875 0.6418103 4.116940
## 33 4.72720 0.19554 4.62730 0.9243567 3.702943
## 34 4.80363 0.22784 4.83516 1.0944591 3.740701
## 35 4.84693 0.30324 4.84670 1.4697831 3.376917
## 36 5.07801 0.07944 4.62549 0.4033971 4.222093
## 37 4.81545 0.13139 4.67747 0.6327020 4.044768
## 38 5.04939 0.15547 4.72525 0.7850287 3.940221
## 39 4.65398 0.28196 4.73437 1.3122362 3.422134
## 40 4.40859 0.19260 4.55586 0.8490944 3.706766
## 41 5.08799 0.18018 4.77578 0.9167540 3.859026
## 42 4.93065 0.11818 4.85490 0.5827042 4.272196
## 43 4.66134 0.35053 4.85645 1.6339395 3.222510
## 44 4.82454 0.12008 4.56859 0.5793308 3.989259
## 45 4.83026 0.22954 4.75826 1.1087379 3.649522
## 46 5.00087 0.10029 4.71169 0.5015373 4.210153
# Scatterplot of totalCost against grossIncome. Labels are given explicitly so
# the axes show the column names rather than the raw `dataDf$...` expressions.
plot(
  x = dataDf$grossIncome,
  y = dataDf$totalCost,
  xlab = "grossIncome",
  ylab = "totalCost",
  main = "totalCost vs. grossIncome"
)
# Histogram of the number of cigarette packs, labelled the same way.
hist(
  dataDf$total_number_of_cig_packs,
  xlab = "total_number_of_cig_packs",
  main = "Histogram of total_number_of_cig_packs"
)
# Correlation between grossIncome and totalCost.
cor(dataDf$grossIncome, dataDf$totalCost)
## [1] 0.4878801