#install.packages("carData")
library(carData)
data(Davis)
mydata1<-print(Davis)
## sex weight height repwt repht
## 1 M 77 182 77 180
## 2 F 58 161 51 159
## 3 F 53 161 54 158
## 4 M 68 177 70 175
## 5 F 59 157 59 155
## 6 M 76 170 76 165
## 7 M 76 167 77 165
## 8 M 69 186 73 180
## 9 M 71 178 71 175
## 10 M 65 171 64 170
## 11 M 70 175 75 174
## 12 F 166 57 56 163
## 13 F 51 161 52 158
## 14 F 64 168 64 165
## 15 F 52 163 57 160
## 16 F 65 166 66 165
## 17 M 92 187 101 185
## 18 F 62 168 62 165
## 19 M 76 197 75 200
## 20 F 61 175 61 171
## 21 M 119 180 124 178
## 22 F 61 170 61 170
## 23 M 65 175 66 173
## 24 M 66 173 70 170
## 25 F 54 171 59 168
## 26 F 50 166 50 165
## 27 F 63 169 61 168
## 28 F 58 166 60 160
## 29 F 39 157 41 153
## 30 M 101 183 100 180
## 31 F 71 166 71 165
## 32 M 75 178 73 175
## 33 M 79 173 76 173
## 34 F 52 164 52 161
## 35 F 68 169 63 170
## 36 M 64 176 65 175
## 37 F 56 166 54 165
## 38 M 69 174 69 171
## 39 M 88 178 86 175
## 40 M 65 187 67 188
## 41 F 54 164 53 160
## 42 M 80 178 80 178
## 43 F 63 163 59 159
## 44 M 78 183 80 180
## 45 M 85 179 82 175
## 46 F 54 160 55 158
## 47 M 73 180 NA NA
## 48 F 49 161 NA NA
## 49 F 54 174 56 173
## 50 F 75 162 75 158
## 51 M 82 182 85 183
## 52 F 56 165 57 163
## 53 M 74 169 73 170
## 54 M 102 185 107 185
## 55 M 64 177 NA NA
## 56 M 65 176 64 172
## 57 F 66 170 65 NA
## 58 M 73 183 74 180
## 59 M 75 172 70 169
## 60 M 57 173 58 170
## 61 M 68 165 69 165
## 62 M 71 177 71 170
## 63 M 71 180 76 175
## 64 F 78 173 75 169
## 65 M 97 189 98 185
## 66 F 60 162 59 160
## 67 F 64 165 63 163
## 68 F 64 164 62 161
## 69 F 52 158 51 155
## 70 M 80 178 76 175
## 71 F 62 175 61 171
## 72 M 66 173 66 175
## 73 F 55 165 54 163
## 74 F 56 163 57 159
## 75 F 50 166 50 161
## 76 F 50 171 NA NA
## 77 F 50 160 55 150
## 78 F 63 160 64 158
## 79 M 69 182 70 180
## 80 M 69 183 70 183
## 81 F 61 165 60 163
## 82 M 55 168 56 170
## 83 F 53 169 52 175
## 84 F 60 167 55 163
## 85 F 56 170 56 170
## 86 M 59 182 61 183
## 87 M 62 178 66 175
## 88 F 53 165 53 165
## 89 F 57 163 59 160
## 90 F 57 162 56 160
## 91 M 70 173 68 170
## 92 F 56 161 56 161
## 93 M 84 184 86 183
## 94 M 69 180 71 180
## 95 M 88 189 87 185
## 96 F 56 165 57 160
## 97 M 103 185 101 182
## 98 F 50 169 50 165
## 99 F 52 159 52 153
## 100 F 55 155 NA 154
## 101 F 55 164 55 163
## 102 M 63 178 63 175
## 103 F 47 163 47 160
## 104 F 45 163 45 160
## 105 F 62 175 63 173
## 106 F 53 164 51 160
## 107 F 52 152 51 150
## 108 F 57 167 55 164
## 109 F 64 166 64 165
## 110 F 59 166 55 163
## 111 M 84 183 90 183
## 112 M 79 179 79 171
## 113 F 55 174 57 171
## 114 M 67 179 67 179
## 115 F 76 167 77 165
## 116 F 62 168 62 163
## 117 M 83 184 83 181
## 118 M 96 184 94 183
## 119 M 75 169 76 165
## 120 M 65 178 66 178
## 121 M 78 178 77 175
## 122 M 69 167 73 165
## 123 F 68 178 68 175
## 124 F 55 165 55 163
## 125 M 67 179 NA NA
## 126 F 52 169 56 NA
## 127 F 47 153 NA 154
## 128 F 45 157 45 153
## 129 F 68 171 68 169
## 130 F 44 157 44 155
## 131 F 62 166 61 163
## 132 M 87 185 89 185
## 133 F 56 160 53 158
## 134 F 50 148 47 148
## 135 M 83 177 84 175
## 136 F 53 162 53 160
## 137 F 64 172 62 168
## 138 F 62 167 NA NA
## 139 M 90 188 91 185
## 140 M 85 191 83 188
## 141 M 66 175 68 175
## 142 F 52 163 53 160
## 143 F 53 165 55 163
## 144 F 54 176 55 176
## 145 F 64 171 66 171
## 146 F 55 160 55 155
## 147 F 55 165 55 165
## 148 F 59 157 55 158
## 149 F 70 173 67 170
## 150 M 88 184 86 183
## 151 F 57 168 58 165
## 152 F 47 162 47 160
## 153 F 47 150 45 152
## 154 F 55 162 NA NA
## 155 F 48 163 44 160
## 156 M 54 169 58 165
## 157 M 69 172 68 174
## 158 F 59 170 NA NA
## 159 F 58 169 NA NA
## 160 F 57 167 56 165
## 161 F 51 163 50 160
## 162 F 54 161 54 160
## 163 F 53 162 52 158
## 164 F 59 172 58 171
## 165 M 56 163 58 161
## 166 F 59 159 59 155
## 167 F 63 170 62 168
## 168 F 66 166 66 165
## 169 M 96 191 95 188
## 170 F 53 158 50 155
## 171 M 76 169 75 165
## 172 F 54 163 NA NA
## 173 M 61 170 61 170
## 174 M 82 176 NA NA
## 175 M 62 168 64 168
## 176 M 71 178 68 178
## 177 F 60 174 NA NA
## 178 M 66 170 67 165
## 179 M 81 178 82 175
## 180 M 68 174 68 173
## 181 M 80 176 78 175
## 182 F 43 154 NA NA
## 183 M 82 181 NA NA
## 184 F 63 165 59 160
## 185 M 70 173 70 173
## 186 F 56 162 56 160
## 187 F 60 172 55 168
## 188 F 58 169 54 166
## 189 M 76 183 75 180
## 190 F 50 158 49 155
## 191 M 88 185 93 188
## 192 M 89 173 86 173
## 193 F 59 164 59 165
## 194 F 51 156 51 158
## 195 F 62 164 61 161
## 196 M 74 175 71 175
## 197 M 83 180 80 180
## 198 M 81 175 NA NA
## 199 M 90 181 91 178
## 200 M 79 177 81 178
Description:
colnames(mydata1)<-c("GenderF", "Weight", "Height", "WeightRep", "HeightRep")
head(mydata1)
## GenderF Weight Height WeightRep HeightRep
## 1 M 77 182 77 180
## 2 F 58 161 51 159
## 3 F 53 161 54 158
## 4 M 68 177 70 175
## 5 F 59 157 59 155
## 6 M 76 170 76 165
mydata1$ID <- seq.int(nrow(mydata1))
mydata1$GenderF <- ifelse(test = mydata1$GenderF == "F",
yes = "Women",
no = "Men")
mydata1$WeightDeviation <- mydata1$Weight-mydata1$WeightRep
mydata1$HeightDeviation <- mydata1$Height-mydata1$HeightRep
mydata1.1 <- mydata1[,c(6,1,2,4,7,3,5,8)]
head(mydata1.1)
## ID GenderF Weight WeightRep WeightDeviation Height HeightRep HeightDeviation
## 1 1 Men 77 77 0 182 180 2
## 2 2 Women 58 51 7 161 159 2
## 3 3 Women 53 54 -1 161 158 3
## 4 4 Men 68 70 -2 177 175 2
## 5 5 Women 59 59 0 157 155 2
## 6 6 Men 76 76 0 170 165 5
sum(is.na(mydata1.1))
## [1] 68
#install.packages("tidyr")
suppressPackageStartupMessages(library(tidyr))
mydata1.1 <- drop_na(mydata1.1)
sapply(mydata1.1[,c(3:8)], FUN=max)
## Weight WeightRep WeightDeviation Height HeightRep HeightDeviation
## 166 124 110 197 200 10
sapply(mydata1.1[,c(-1,-2)], FUN=min)
## Weight WeightRep WeightDeviation Height HeightRep HeightDeviation
## 39 41 -9 57 148 -106
mydata1.1[order(mydata1.1$Height),]
## ID GenderF Weight WeightRep WeightDeviation Height HeightRep HeightDeviation
## 12 12 Women 166 56 110 57 163 -106
## 134 134 Women 50 47 3 148 148 0
## 153 153 Women 47 45 2 150 152 -2
## 107 107 Women 52 51 1 152 150 2
## 194 194 Women 51 51 0 156 158 -2
## 5 5 Women 59 59 0 157 155 2
## 29 29 Women 39 41 -2 157 153 4
## 128 128 Women 45 45 0 157 153 4
## 130 130 Women 44 44 0 157 155 2
## 148 148 Women 59 55 4 157 158 -1
## 69 69 Women 52 51 1 158 155 3
## 170 170 Women 53 50 3 158 155 3
## 190 190 Women 50 49 1 158 155 3
## 99 99 Women 52 52 0 159 153 6
## 166 166 Women 59 59 0 159 155 4
## 46 46 Women 54 55 -1 160 158 2
## 77 77 Women 50 55 -5 160 150 10
## 78 78 Women 63 64 -1 160 158 2
## 133 133 Women 56 53 3 160 158 2
## 146 146 Women 55 55 0 160 155 5
## 2 2 Women 58 51 7 161 159 2
## 3 3 Women 53 54 -1 161 158 3
## 13 13 Women 51 52 -1 161 158 3
## 92 92 Women 56 56 0 161 161 0
## 162 162 Women 54 54 0 161 160 1
## 50 50 Women 75 75 0 162 158 4
## 66 66 Women 60 59 1 162 160 2
## 90 90 Women 57 56 1 162 160 2
## 136 136 Women 53 53 0 162 160 2
## 152 152 Women 47 47 0 162 160 2
## 163 163 Women 53 52 1 162 158 4
## 186 186 Women 56 56 0 162 160 2
## 15 15 Women 52 57 -5 163 160 3
## 43 43 Women 63 59 4 163 159 4
## 74 74 Women 56 57 -1 163 159 4
## 89 89 Women 57 59 -2 163 160 3
## 103 103 Women 47 47 0 163 160 3
## 104 104 Women 45 45 0 163 160 3
## 142 142 Women 52 53 -1 163 160 3
## 155 155 Women 48 44 4 163 160 3
## 161 161 Women 51 50 1 163 160 3
## 165 165 Men 56 58 -2 163 161 2
## 34 34 Women 52 52 0 164 161 3
## 41 41 Women 54 53 1 164 160 4
## 68 68 Women 64 62 2 164 161 3
## 101 101 Women 55 55 0 164 163 1
## 106 106 Women 53 51 2 164 160 4
## 193 193 Women 59 59 0 164 165 -1
## 195 195 Women 62 61 1 164 161 3
## 52 52 Women 56 57 -1 165 163 2
## 61 61 Men 68 69 -1 165 165 0
## 67 67 Women 64 63 1 165 163 2
## 73 73 Women 55 54 1 165 163 2
## 81 81 Women 61 60 1 165 163 2
## 88 88 Women 53 53 0 165 165 0
## 96 96 Women 56 57 -1 165 160 5
## 124 124 Women 55 55 0 165 163 2
## 143 143 Women 53 55 -2 165 163 2
## 147 147 Women 55 55 0 165 165 0
## 184 184 Women 63 59 4 165 160 5
## 16 16 Women 65 66 -1 166 165 1
## 26 26 Women 50 50 0 166 165 1
## 28 28 Women 58 60 -2 166 160 6
## 31 31 Women 71 71 0 166 165 1
## 37 37 Women 56 54 2 166 165 1
## 75 75 Women 50 50 0 166 161 5
## 109 109 Women 64 64 0 166 165 1
## 110 110 Women 59 55 4 166 163 3
## 131 131 Women 62 61 1 166 163 3
## 168 168 Women 66 66 0 166 165 1
## 7 7 Men 76 77 -1 167 165 2
## 84 84 Women 60 55 5 167 163 4
## 108 108 Women 57 55 2 167 164 3
## 115 115 Women 76 77 -1 167 165 2
## 122 122 Men 69 73 -4 167 165 2
## 160 160 Women 57 56 1 167 165 2
## 14 14 Women 64 64 0 168 165 3
## 18 18 Women 62 62 0 168 165 3
## 82 82 Men 55 56 -1 168 170 -2
## 116 116 Women 62 62 0 168 163 5
## 151 151 Women 57 58 -1 168 165 3
## 175 175 Men 62 64 -2 168 168 0
## 27 27 Women 63 61 2 169 168 1
## 35 35 Women 68 63 5 169 170 -1
## 53 53 Men 74 73 1 169 170 -1
## 83 83 Women 53 52 1 169 175 -6
## 98 98 Women 50 50 0 169 165 4
## 119 119 Men 75 76 -1 169 165 4
## 156 156 Men 54 58 -4 169 165 4
## 171 171 Men 76 75 1 169 165 4
## 188 188 Women 58 54 4 169 166 3
## 6 6 Men 76 76 0 170 165 5
## 22 22 Women 61 61 0 170 170 0
## 85 85 Women 56 56 0 170 170 0
## 167 167 Women 63 62 1 170 168 2
## 173 173 Men 61 61 0 170 170 0
## 178 178 Men 66 67 -1 170 165 5
## 10 10 Men 65 64 1 171 170 1
## 25 25 Women 54 59 -5 171 168 3
## 129 129 Women 68 68 0 171 169 2
## 145 145 Women 64 66 -2 171 171 0
## 59 59 Men 75 70 5 172 169 3
## 137 137 Women 64 62 2 172 168 4
## 157 157 Men 69 68 1 172 174 -2
## 164 164 Women 59 58 1 172 171 1
## 187 187 Women 60 55 5 172 168 4
## 24 24 Men 66 70 -4 173 170 3
## 33 33 Men 79 76 3 173 173 0
## 60 60 Men 57 58 -1 173 170 3
## 64 64 Women 78 75 3 173 169 4
## 72 72 Men 66 66 0 173 175 -2
## 91 91 Men 70 68 2 173 170 3
## 149 149 Women 70 67 3 173 170 3
## 185 185 Men 70 70 0 173 173 0
## 192 192 Men 89 86 3 173 173 0
## 38 38 Men 69 69 0 174 171 3
## 49 49 Women 54 56 -2 174 173 1
## 113 113 Women 55 57 -2 174 171 3
## 180 180 Men 68 68 0 174 173 1
## 11 11 Men 70 75 -5 175 174 1
## 20 20 Women 61 61 0 175 171 4
## 23 23 Men 65 66 -1 175 173 2
## 71 71 Women 62 61 1 175 171 4
## 105 105 Women 62 63 -1 175 173 2
## 141 141 Men 66 68 -2 175 175 0
## 196 196 Men 74 71 3 175 175 0
## 36 36 Men 64 65 -1 176 175 1
## 56 56 Men 65 64 1 176 172 4
## 144 144 Women 54 55 -1 176 176 0
## 181 181 Men 80 78 2 176 175 1
## 4 4 Men 68 70 -2 177 175 2
## 62 62 Men 71 71 0 177 170 7
## 135 135 Men 83 84 -1 177 175 2
## 200 200 Men 79 81 -2 177 178 -1
## 9 9 Men 71 71 0 178 175 3
## 32 32 Men 75 73 2 178 175 3
## 39 39 Men 88 86 2 178 175 3
## 42 42 Men 80 80 0 178 178 0
## 70 70 Men 80 76 4 178 175 3
## 87 87 Men 62 66 -4 178 175 3
## 102 102 Men 63 63 0 178 175 3
## 120 120 Men 65 66 -1 178 178 0
## 121 121 Men 78 77 1 178 175 3
## 123 123 Women 68 68 0 178 175 3
## 176 176 Men 71 68 3 178 178 0
## 179 179 Men 81 82 -1 178 175 3
## 45 45 Men 85 82 3 179 175 4
## 112 112 Men 79 79 0 179 171 8
## 114 114 Men 67 67 0 179 179 0
## 21 21 Men 119 124 -5 180 178 2
## 63 63 Men 71 76 -5 180 175 5
## 94 94 Men 69 71 -2 180 180 0
## 197 197 Men 83 80 3 180 180 0
## 199 199 Men 90 91 -1 181 178 3
## 1 1 Men 77 77 0 182 180 2
## 51 51 Men 82 85 -3 182 183 -1
## 79 79 Men 69 70 -1 182 180 2
## 86 86 Men 59 61 -2 182 183 -1
## 30 30 Men 101 100 1 183 180 3
## 44 44 Men 78 80 -2 183 180 3
## 58 58 Men 73 74 -1 183 180 3
## 80 80 Men 69 70 -1 183 183 0
## 111 111 Men 84 90 -6 183 183 0
## 189 189 Men 76 75 1 183 180 3
## 93 93 Men 84 86 -2 184 183 1
## 117 117 Men 83 83 0 184 181 3
## 118 118 Men 96 94 2 184 183 1
## 150 150 Men 88 86 2 184 183 1
## 54 54 Men 102 107 -5 185 185 0
## 97 97 Men 103 101 2 185 182 3
## 132 132 Men 87 89 -2 185 185 0
## 191 191 Men 88 93 -5 185 188 -3
## 8 8 Men 69 73 -4 186 180 6
## 17 17 Men 92 101 -9 187 185 2
## 40 40 Men 65 67 -2 187 188 -1
## 139 139 Men 90 91 -1 188 185 3
## 65 65 Men 97 98 -1 189 185 4
## 95 95 Men 88 87 1 189 185 4
## 140 140 Men 85 83 2 191 188 3
## 169 169 Men 96 95 1 191 188 3
## 19 19 Men 76 75 1 197 200 -3
mydata1.2 <- mydata1.1[-12,]
sapply(mydata1.2[,c(3:8)], FUN=max)
## Weight WeightRep WeightDeviation Height HeightRep HeightDeviation
## 119 124 7 197 200 10
sapply(mydata1.2[,c(-1,-2)], FUN=min)
## Weight WeightRep WeightDeviation Height HeightRep HeightDeviation
## 39 41 -9 148 148 -6
#install.packages("pastecs")
suppressPackageStartupMessages(library(pastecs))
round(stat.desc(mydata1.2[,c(-1,-2)]), 2)
## Weight WeightRep WeightDeviation Height HeightRep HeightDeviation
## nbr.val 180.00 180.00 180.00 180.00 180.00 180.00
## nbr.null 0.00 0.00 49.00 0.00 0.00 26.00
## nbr.na 0.00 0.00 0.00 0.00 0.00 0.00
## min 39.00 41.00 -9.00 148.00 148.00 -6.00
## max 119.00 124.00 7.00 197.00 200.00 10.00
## range 80.00 83.00 16.00 49.00 52.00 16.00
## sum 11835.00 11832.00 3.00 30741.00 30364.00 377.00
## median 63.00 63.00 0.00 169.50 168.00 2.00
## mean 65.75 65.73 0.02 170.78 168.69 2.09
## SE.mean 1.00 1.03 0.17 0.67 0.70 0.15
## CI.mean.0.95 1.98 2.04 0.34 1.32 1.38 0.31
## var 180.83 191.93 5.35 80.51 88.57 4.32
## std.dev 13.45 13.85 2.31 8.97 9.41 2.08
## coef.var 0.20 0.21 138.73 0.05 0.06 0.99
Description:
Minimum deviation in reported weight from actual weight is -9kg, which means that person reported that they weigh 9kg less than they actually do. The maximum deviation in weight was 7kg (reported weight was 7kg higher than actual one). Consequently the range is 16kg. In hight the minimum deviation was -6cm and the maximum was 10cm, meaning the range equals 16cm.
In reporting the weight 50% of those who were questioned reported the weight lower than their actual one and 50% more. In reporting their height, however, 50% of those who were questioned reported the height lower than their actual height minus 2cm and 50% more than that. The average deviation from actual weight in reported weight was 0.02kg and in height 2.09cm. Provided the sample was random, the true mean of weight deviation would be somewhere on: (-0.32kg, 0.36kg) (alfa=5.00%) and the true mean of height deviation would be somewhere on (1.78cm, 2.40cm) (alfa=5,00%).
Variability of the 2 variables (HeightDeviation and WeightDeviation) can be compared by coefficient of variance, which is much higher in WeightDeviation and so is, consequently, also its variability.
#install.packages("psych")
suppressPackageStartupMessages(library(psych))
describeBy(mydata1.2$WeightDeviation, mydata1.2$GenderF)
##
## Descriptive statistics by group
## group: Men
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 82 -0.59 2.49 -0.5 -0.47 2.22 -9 5 14 -0.55 0.61 0.27
## ------------------------------------------------------------------------------------------
## group: Women
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 98 0.52 2.03 0 0.45 1.48 -5 7 12 0.28 1.32 0.21
We can see that women have a slightly smaller range in WeightDeviation. While on average women deviated in their reported weight from actual weight by 0.52kg, men deviated by -0.59. 50% of women deviated in their report from actual weight by less than 0, while 50% did more than 0. 50% of men on the other hand deviated in report from their actual weight less than -0.5 and 50% more than this. Results show us that more men than not reported their weight to be higher than their actual one, whereas women, although half of them had less than 0 and half of them more than 0kg of deviation, on average tended to report a smaller weight than their actual one.
describeBy(mydata1.2$HeightDeviation, mydata1.2$GenderF)
##
## Descriptive statistics by group
## group: Men
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 82 1.76 2.15 2 1.74 2.22 -3 8 11 0.14 0.02 0.24
## ------------------------------------------------------------------------------------------
## group: Women
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 98 2.38 1.99 3 2.44 1.48 -6 10 16 -0.4 3.82 0.2
With HeightDeviation women have a bigger range than men. However, both men and women tended to more often report a smaller height than their actual one was, compared to reporting a higher one. 50% of women reported a height more than 3cm smaller than their actual one and 50% less than 3cm smaller than their actual one. 50% of men reported a height more than 2cm smaller than their actual one and 50% less than 2cm smaller than their actual one. On average women reported a 2.38cm and men 1.76cm smaller height than their actual one.
hist(mydata1.2$WeightDeviation,
main = "Distribution of deviations in reported weight from actual weight",
xlab = "Deviation of weight",
ylab = "Frequency",
breaks = seq(from = -10, to = 10, by = 1))
#install.packages("ggplot2)
library(ggplot2)
ggplot(mydata1.2, aes(x=WeightDeviation, fill=GenderF)) +
geom_histogram(position="dodge", binwidth = 1, colour="gray") +
ylab("Frequency") +
xlab("Weight-Reported weight") +
scale_fill_brewer(palette="PRGn")
In the first histogram we can see that deviation in reported weight from actual weight mostly equals 0. Further away from 0 deviation we go (in either direction) less frequent the given deviation is, with exception of -5, where we have another peak, although this peak is much lower then the one in 0.
Second histogram shows us a distribution of the same variable just made separately for women and men. Many more women than men reported the actual weight. Otherwise there are not any major differences. We can observe, though, that on the far left (where those who reported a higher than actual weight are) men are more frequent, and on the far right (where those who reported a lower than actual weight are) women are more frequent.
hist(mydata1.2$HeightDeviation,
main = "Distribution of deviations in reported height from actual height",
xlab = "Deviation of height",
ylab = "Frequency",
breaks = seq(from = -10, to = 10, by = 1))
library(ggplot2)
ggplot(mydata1.2, aes(x=HeightDeviation, fill=GenderF)) +
geom_histogram(position="dodge", binwidth = 1, colour="gray") +
ylab("Frequency") +
xlab("Height-Reported height") +
scale_fill_brewer(palette="PRGn")
In the first histogram we can see that deviation in reported height from actual height mostly equals 3, but there are also high frequencies at 2 and 0. Deviation 1 in between is a bit less frequent. However, further to the sides from these values we go, lesser the frequency of deviation.
Second histogram shows us a distribution of the same variable just made separately for women and men. We can see that at 2 (the highest frequency for both together) women and men are very evenly distributed. The values where we can see large differences between the genders are 0 (more men), 2 and 4 (more women), otherwise frequencies between the two are pretty much the same.
scatterplot(HeightDeviation ~ WeightDeviation | GenderF,
ylab = "Height - Reported Height",
xlab = "Weight - Reported Weight",
smooth = FALSE,
data = mydata1.2)
In the scatterplot we can observe the relationship between deviation in weight and deviation in height. The drawn lines are not completely horizontal, interestingly, the slope of women’s line is positive while the men’s is slightly negative, however, the slopes are so small we can not really claim that they are correlated based on this graph.
ggplot(mydata1.2, aes(x=GenderF, y=WeightDeviation)) +
geom_boxplot() +
xlab("Gender") +
ylab("Weight-Reported weight")
From given boxplot we can see that the interquartile range for women’s weight deviation is much smaller than the men’s one. The median is higher in case of women, where we can also see that it actually equals the first quartile. This means that the weight deviation of the whole second quarter was 0 (= median). Though, the 3rd quartiles are the same for both. Men’s weight deviations are a more spread out, have higher standard deviation than women’s weight deviatons. However, although outliers seem to be present in both cases, there is many more in women’s case.
ggplot(mydata1.2, aes(x=GenderF, y=HeightDeviation)) +
geom_boxplot() +
xlab("Gender") +
ylab("Height-Reported height")
From this boxplot we can see that the interquartile range for women’s height deviation is again smaller than the men’s one. The median is again higher in case of women, where we can now see that it equals the third quartile. This means that the height deviation of the whole third quarter was 2 (= median). The 3rd quartiles, however, are again the same for both. Men’s height deviations are also once again more spread out, have higher standard deviation than women’s height deviatons, and also once again outliers seem to be present in both cases, but there is many more in case of women.
mydata2 <- read.table("C:/Users/TinaP/Desktop/EF/IMB/bootcamp/statistics/exam/R Take Home Exam/R Take Home Exam/Task 2/Body mass.csv", header=TRUE, sep=";", dec=",")
head(mydata2)
## ID Mass
## 1 1 62.1
## 2 2 64.5
## 3 3 56.5
## 4 4 53.4
## 5 5 61.3
## 6 6 62.2
Description:
library(pastecs)
round(stat.desc(mydata2[,-1]), 2)
## nbr.val nbr.null nbr.na min max range sum median mean
## 50.00 0.00 0.00 49.70 83.20 33.50 3143.80 62.80 62.88
## SE.mean CI.mean.0.95 var std.dev coef.var
## 0.85 1.71 36.14 6.01 0.10
Our sample contains 50 units. There are no missing values. Min (49.70kg) and max(83.20) number are not such that they would already at first glance indicate a mistake in data writing process. Range of the variable is 33.50kg. 50% of observed students have their body mess less than 62.80 kg and 50% more. If we were to generalize this to the whole population Mu would fall somewhere on the interval (61.17, 64.59), alfa=5,00%. Variance is 36.14 and standard deviation 6.01 kg, coefficient of variance is 10%.
hist(mydata2$Mass,
main = "Distribution of body mass",
xlab = "Body mass",
ylab = "Frequency",
breaks = seq(from = 40, to = 90, by = 5))
We can see that the distribution is unimodal, with majority of units falling somewhere in between 60 and 65.
H0: Mu = 59.5 H1: Mu =/ 59.5
t.test(mydata2$Mass,
mu = 59.5,
alternative = "two.sided")
##
## One Sample t-test
##
## data: mydata2$Mass
## t = 3.9711, df = 49, p-value = 0.000234
## alternative hypothesis: true mean is not equal to 59.5
## 95 percent confidence interval:
## 61.16758 64.58442
## sample estimates:
## mean of x
## 62.876
The test shows us that we can reject the H0 at p=0.000234 and can accept H1 - Mu does not equal 59.5kg (the average from year 2018/19). We can claim with 95% certainty that Mu is somewhere between 61.17kg and 64.58kg.
#install.packages("rstatix")
suppressPackageStartupMessages(library(rstatix))
mydata2 %>% cohens_d(Mass~1, mu=59.5)
## # A tibble: 1 x 6
## .y. group1 group2 effsize n magnitude
## * <chr> <chr> <chr> <dbl> <int> <ord>
## 1 Mass 1 null model 0.562 50 moderate
(mean(mydata2$Mass)- 59.5)/sd(mydata2$Mass)
## [1] 0.5615994
Cohen d effect size equals 0.56, which means the effect size is moderate and with it the difference between means as well. Practical significance is moderate.
We can conclude that the average body mass of 9th graders in 2020/21 is on average probably higher than the body mass of 9th graders in 2018/19, with high enough statistical significance (p=0.000234) and moderate practical significance.
#install.packages("readxl")
library(readxl)
mydata3 <- read_xlsx("C:/Users/TinaP/Desktop/EF/IMB/bootcamp/statistics/exam/R Take Home Exam/R Take Home Exam/Task 3/Apartments.xlsx")
head(mydata3)
## # A tibble: 6 x 5
## Age Distance Price Parking Balcony
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Description:
mydata3$ParkingF <- factor(mydata3$Parking,
levels = c(0, 1),
labels = c("No", "Yes"))
mydata3$BalconyF <- factor(mydata3$Balcony,
levels = c(0, 1),
labels = c("No", "Yes"))
t.test(mydata3$Price,
mu=1900,
alternative = "two.sided")
##
## One Sample t-test
##
## data: mydata3$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
Based on the sample data we reject H0: Mu_Price=1900 EUR at p=0.0047. It is very unlikely that average price per m2 of apartments in Ljubljana region equals 1900 EUR. We can also say that Mu_Price is on the interval (1937.443, 2100.440) with 95% certainty.
fit1 <- lm(Price ~ Age,
data = mydata3)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = mydata3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
If age of an apartment in Ljubljana region is increased by 1 year then its price gets on average decreased by 8.975 EUR per m2. (p=0.034)
5.30% of Ljubljana apartment’s price variability is explained by linear effect of apartment’s age.
sqrt(summary(fit1)$r.squared)
## [1] 0.230255
#b_Age is negative
cor(mydata3$Price, mydata3$Age)
## [1] -0.230255
Linear relationship between Price and Age of Ljubljana’s apartments is negative and weak.
library(car)
scatterplotMatrix(mydata3[ , c(3,1,2)],
smooth = FALSE)
There does not seem to be a problem with multicolinearity, since points are not that close to the regression line (on graph Age~Distance).
fit2 <- lm(Price ~ Age + Distance,
data = mydata3)
vif(fit2)
## Age Distance
## 1.001845 1.001845
mean(vif(fit2))
## [1] 1.001845
VIF for both Age and Distance is less than 5, additionally their average is very near 1.00, which means multicolinearity is not too great.
mydata3$StdResiduals <- round(rstandard(fit2), 3)
head(mydata3[order(mydata3$StdResiduals),])
## # A tibble: 6 x 8
## Age Distance Price Parking Balcony ParkingF BalconyF StdResiduals
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <dbl>
## 1 7 2 1760 0 1 No Yes -2.15
## 2 12 14 1650 0 1 No Yes -1.50
## 3 12 14 1650 0 0 No No -1.50
## 4 13 8 1800 0 0 No No -1.38
## 5 14 16 1660 0 1 No Yes -1.26
## 6 24 5 1830 1 0 Yes No -1.19
head(mydata3[order(-mydata3$StdResiduals),])
## # A tibble: 6 x 8
## Age Distance Price Parking Balcony ParkingF BalconyF StdResiduals
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <dbl>
## 1 5 45 2180 1 1 Yes Yes 2.58
## 2 2 11 2790 1 0 Yes No 2.05
## 3 18 1 2800 1 0 Yes No 1.78
## 4 18 1 2800 1 1 Yes Yes 1.78
## 5 8 2 2820 1 0 Yes No 1.66
## 6 10 1 2810 0 0 No No 1.60
There is no problematic outliers, since every standardized residual is within [-3,3].
mydata3$CooksDist <- round(cooks.distance(fit2), 3)
hist(mydata3$CooksDist,
xlab = "Cooks distance",
ylab = "Frequency",
main = "Histogram of Cooks distances")
Every unit has cooks distance below 1.00, which is why none must necessarily be removed, however, we can remove the outstanding one, because it is much bigger than any other.
mydata3$ID <- 1:nrow(mydata3)
head(mydata3[order(-mydata3$CooksDist),])
## # A tibble: 6 x 10
## Age Distance Price Parking Balcony ParkingF BalconyF StdResiduals CooksDist ID
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <dbl> <dbl> <int>
## 1 5 45 2180 1 1 Yes Yes 2.58 0.32 38
## 2 43 37 1740 0 0 No No 1.44 0.104 55
## 3 2 11 2790 1 0 Yes No 2.05 0.069 33
## 4 7 2 1760 0 1 No Yes -2.15 0.066 53
## 5 37 3 2540 1 1 Yes Yes 1.58 0.061 22
## 6 40 2 2400 0 1 No Yes 1.09 0.038 39
mydata3 <- mydata3[-38,]
fit2 <- lm(Price ~ Age + Distance,
data = mydata3)
#calculating CooksDist once again, so there are correct values within the table
mydata3$CooksDist <- round(cooks.distance(fit2), 3)
mydata3$StdFittedValues <- round(scale(fit2$fitted.values), 3)
mydata3$StdResiduals <- round(rstandard(fit2), 3)
library(car)
scatterplot(y = mydata3$StdResiduals, x = mydata3$StdFittedValues,
ylab = "Standardized residuals",
xlab = "Standardized fitted values",
boxplots = FALSE,
regLine = FALSE,
smooth = FALSE)
From the graph it is possible to see that units on the left might be a bit less dispersed than on the right, but it could still be too little to truly indicate heteroskedasticity. We could just check it with Breusch Pagan test:
suppressPackageStartupMessages(library(olsrr))
ols_test_breusch_pagan(fit2)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ---------------------------------
## Response : Price
## Variables: fitted values of Price
##
## Test Summary
## -----------------------------
## DF = 1
## Chi2 = 2.927455
## Prob > Chi2 = 0.08708469
We cannot reject the H0, which means there is no heteroskedasticity (p=0.0871).
hist(mydata3$StdResiduals,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
shapiro.test(mydata3$StdResiduals)
##
## Shapiro-Wilk normality test
##
## data: mydata3$StdResiduals
## W = 0.95649, p-value = 0.006355
Already the histogram does not indicate that standardizes residuals are distributed normally, and formal Shapiro_Wilk normality test only confirms our suspicions. We reject H0: distribution is normal; at p=0.0064 We also have less than 100 sample units which does not help our problem. The following conclusions based on this model are therefore unreliable.
fit2 <- lm(Price ~ Age + Distance,
data = mydata3)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -604.92 -229.63 -56.49 192.97 599.35
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2456.076 73.931 33.221 < 2e-16 ***
## Age -6.464 3.159 -2.046 0.044 *
## Distance -22.955 2.786 -8.240 2.52e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.1 on 81 degrees of freedom
## Multiple R-squared: 0.4838, Adjusted R-squared: 0.4711
## F-statistic: 37.96 on 2 and 81 DF, p-value: 2.339e-12
If age of an apartment in Ljubljana region is increased by 1 year then its price gets on average decreased by 6.464 EUR per m2, provided everything else stays the same. (p=0.044)
If distance from the apartment to the city center is increased by 1 km then its price gets on average decreased by 22.955 EUR per m2, provided everything else stays the same. (p<0.001)
48.38% of Ljubljana apartment’s price variability is explained by linear effect of apartment’s age and distance from the city center. (p<0.001)
sqrt(summary(fit2)$r.squared)
## [1] 0.6955609
The linear relationship between Price and both explanatory variables is semi strong.
fit3 <- lm(Price ~ Age + Distance + ParkingF + BalconyF,
data = mydata3)
anova(fit2, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + ParkingF + BalconyF
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 81 6176767
## 2 79 5654480 2 522287 3.6485 0.03051 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Model fit3 fits data better (p=0.03051).
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingF + BalconyF, data = mydata3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -473.21 -192.37 -28.89 204.17 558.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2329.724 93.066 25.033 < 2e-16 ***
## Age -5.821 3.074 -1.894 0.06190 .
## Distance -20.279 2.886 -7.026 6.66e-10 ***
## ParkingFYes 167.531 62.864 2.665 0.00933 **
## BalconyFYes -15.207 59.201 -0.257 0.79795
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 267.5 on 79 degrees of freedom
## Multiple R-squared: 0.5275, Adjusted R-squared: 0.5035
## F-statistic: 22.04 on 4 and 79 DF, p-value: 3.018e-12
Given the age, distance and presence of a balcony stay the same, price (per m2) of apartments with a parking option is on average higher by 167.531 EUR than price (per m2) of apartments without this option. (p=0.00933)
We could not confirm that a balcony presence affects the price of an apartment in Ljubljana. (p=0.79795)
The hypothesis being tested with F-statistics: H0: Ro2 = 0 H1: Ro2 =/ 0
mydata3$FittedValues <- fitted.values(fit3)
mydata3$Residuals <- mydata3$Price - mydata3$FittedValues
mydata3[2,13]
## # A tibble: 1 x 1
## Residuals
## <dbl>
## 1 428.