1

1.1 適合度検定

nankai <- data.frame(month = 1:12, frequency=c(0,2,0,0,0,0,0,2,1,1,1,5))

barplot(height = nankai$frequency,
        names.arg = nankai$month, xlab = "month", ylab = "frequency")

chisq.test(nankai$frequency,p=rep(1/12,length=12))
## Warning in chisq.test(nankai$frequency, p = rep(1/12, length = 12)): Chi-squared
## approximation may be incorrect
## 
##  Chi-squared test for given probabilities
## 
## data:  nankai$frequency
## X-squared = 24, df = 11, p-value = 0.01273

\[結論 帰無仮説H_0:各々の月に地震が起きる確率が12分の1は棄却される。\]

# accident_count 事故回数
# year_count 該当する年の数

df <- data.frame(accident_count = 0:8, year_count=c(1,6,6,8,5,7,0,1,0))
m <- sum((0:8)*df$year_count)/sum(df$year_count)

print(paste("最尤推定量λ:",m))
## [1] "最尤推定量λ: 3.05882352941176"

\[事故回数を8回以上と解釈した場合(自由度8)の検定\]

poisprob<-dpois(0:7,lambda=m)
theory<-c(poisprob,1-sum(poisprob))
chisq.test(df$year_count,p=theory)
## Warning in chisq.test(df$year_count, p = theory): Chi-squared approximation may
## be incorrect
## 
##  Chi-squared test for given probabilities
## 
## data:  df$year_count
## X-squared = 6.5411, df = 8, p-value = 0.5869

\[9回以上の実現値を0と解釈した場合(自由度9)の検定\]

poisprob2<-dpois(0:8,lambda=m)
theory2<-c(poisprob2,1-sum(poisprob2))
years2<-c(df$year_count,0)

chisq.test(years2,p=theory2)
## Warning in chisq.test(years2, p = theory2): Chi-squared approximation may be
## incorrect
## 
##  Chi-squared test for given probabilities
## 
## data:  years2
## X-squared = 6.5411, df = 9, p-value = 0.6848

\[χ二乗統計量の分布\]

N <- 1000
p <- c(4,3,2,1)/10
x <- rmultinom(N, size=10, prob=p)
expmatrix <- rep(N*p,N)
z <- x - expmatrix
chisqval <- apply(z*z/expmatrix, 2, sum)
hist(chisqval)

2 章 分割表の検定(2)

library(MASS)
## Warning: パッケージ 'MASS' はバージョン 4.1.3 の R の下で造られました
SmokeEx <- table(survey$Smoke, survey$Exer)

chisq.test(SmokeEx)
## Warning in chisq.test(SmokeEx): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  SmokeEx
## X-squared = 5.4885, df = 6, p-value = 0.4828

2.1 イエーツの補正有無でP値を比較

sb<-matrix(c(1085,703,55623,441239),2,2)
chisq.test(sb,correct=FALSE)
## 
##  Pearson's Chi-squared test
## 
## data:  sb
## X-squared = 4328.9, df = 1, p-value < 2.2e-16
# correct=TRUE
chisq.test(sb)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  sb
## X-squared = 4324, df = 1, p-value < 2.2e-16

2.2 母比率の差の検定

prop.test(c(8,13),c(20,20))
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(8, 13) out of c(20, 20)
## X-squared = 1.604, df = 1, p-value = 0.2053
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.59965664  0.09965664
## sample estimates:
## prop 1 prop 2 
##   0.40   0.65
bluepen<-matrix(c(8,12,13,7),2,2)
chisq.test(bluepen)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  bluepen
## X-squared = 1.604, df = 1, p-value = 0.2053

2.3 フィッシャーの正確検定

fisher.test(bluepen)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  bluepen
## p-value = 0.2049
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.08194956 1.53121239
## sample estimates:
## odds ratio 
##  0.3687025

2.4 フィッシャーの正確検定の計算原理 SKIP

2.5 独立性の検定が役に立つ場合

mrecall<-matrix(c(14,17,6,2),2,2)

chisq.test(mrecall)
## Warning in chisq.test(mrecall): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mrecall
## X-squared = 1.2292, df = 1, p-value = 0.2676
chisq.test(mrecall,correct=FALSE)  # イエーツの補正なし
## Warning in chisq.test(mrecall, correct = FALSE): Chi-squared approximation may
## be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mrecall
## X-squared = 2.2662, df = 1, p-value = 0.1322
fisher.test(mrecall)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  mrecall
## p-value = 0.2351
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.02431935 1.91518631
## sample estimates:
## odds ratio 
##  0.2835388

2.6 残差分析

メモ

  • χ二乗検定→有意→標準化残差の絶対値の上側累積確率のチェック
MF <- matrix(c(482, 532, 303, 390, 30, 38, 29, 62), 4, 2,
  dimnames = list(c("機械", "電気", "電子", "環境"), c("男性", "女性"))
)

res <- chisq.test(MF)
res$stdres
##            男性       女性
## 機械  2.5322605 -2.5322605
## 電気  1.9026177 -1.9026177
## 電子 -0.1540644  0.1540644
## 環境 -4.5452585  4.5452585
# 正規分布の上側累積確率を計算
p.value.matrix <- pnorm(abs(res$stdres),lower.tail=FALSE)*2

round(p.value.matrix, 4)*100
##       男性  女性
## 機械  1.13  1.13
## 電気  5.71  5.71
## 電子 87.76 87.76
## 環境  0.00  0.00

3 章 単回帰

3.1 近似直線の導出

# speed(時速)、dist(停止までに走行する距離、ft)
res<-lm(formula = dist~speed, data=cars)
plot(cars)
abline(res)

summary(res)
## 
## Call:
## lm(formula = dist ~ speed, data = cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.069  -9.525  -2.272   9.215  43.201 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17.5791     6.7584  -2.601   0.0123 *  
## speed         3.9324     0.4155   9.464 1.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared:  0.6511, Adjusted R-squared:  0.6438 
## F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12

3.2 Rにおける決定係数

summary(res)
## 
## Call:
## lm(formula = dist ~ speed, data = cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.069  -9.525  -2.272   9.215  43.201 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17.5791     6.7584  -2.601   0.0123 *  
## speed         3.9324     0.4155   9.464 1.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared:  0.6511, Adjusted R-squared:  0.6438 
## F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12
# Call:
# lm(formula = dist ~ speed, data = cars)
# 
# Residuals:
#     Min      1Q  Median      3Q     Max 
# -29.069  -9.525  -2.272   9.215  43.201 
# 
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept) -17.5791     6.7584  -2.601   0.0123 *  
# speed         3.9324     0.4155   9.464 1.49e-12 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# Residual standard error: 15.38 on 48 degrees of freedom
# Multiple R-squared:  0.6511,  Adjusted R-squared:  0.6438 
# F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12
# -1とすることで切片を0とする回帰になる
res<-lm(formula = dist~speed - 1, data=cars)
plot(cars,xlim=c(0,25))
abline(res)

summary(res)
## 
## Call:
## lm(formula = dist ~ speed - 1, data = cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.183 -12.637  -5.455   4.590  50.181 
## 
## Coefficients:
##       Estimate Std. Error t value Pr(>|t|)    
## speed   2.9091     0.1414   20.58   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.26 on 49 degrees of freedom
## Multiple R-squared:  0.8963, Adjusted R-squared:  0.8942 
## F-statistic: 423.5 on 1 and 49 DF,  p-value: < 2.2e-16

3.3 説明変数、被説明変数のとり方による直線の違い

メモ X⇒YとY⇒Xは等しくないし、フィッティングさせた直線の解釈も異なる。

# sampledata2.xlsxのTIMSS2011シートのV列
likemath <- read.csv("likemath.csv",header=FALSE)
scoremath <- read.csv("scoremath.csv",header=FALSE)
plot(likemath$V1,scoremath$V1)
res<-lm(scoremath$V1~likemath$V1)
abline(res)
summary(res)  # y = 679.1611 -3.1604x
## 
## Call:
## lm(formula = scoremath$V1 ~ likemath$V1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -113.609  -41.289   -7.856   29.476  172.032 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 679.1611    42.7252  15.896  < 2e-16 ***
## likemath$V1  -3.1604     0.6045  -5.228 4.78e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 56.62 on 43 degrees of freedom
## Multiple R-squared:  0.3887, Adjusted R-squared:  0.3744 
## F-statistic: 27.34 on 1 and 43 DF,  p-value: 4.775e-06
# 平均点が上がるほど数学が嫌いになる。と考え説明変数と独立変数を入れ替えた場合:
res2<-lm(likemath$V1~scoremath$V1)
summary(res2)
## 
## Call:
## lm(formula = likemath$V1 ~ scoremath$V1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.777  -8.427   1.854   7.745  25.258 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  125.87889   10.95082  11.495 1.07e-14 ***
## scoremath$V1  -0.12297    0.02352  -5.228 4.78e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.17 on 43 degrees of freedom
## Multiple R-squared:  0.3887, Adjusted R-squared:  0.3744 
## F-statistic: 27.34 on 1 and 43 DF,  p-value: 4.775e-06
# summaryより、
# x = 125.87889 - 0.12297y
# ⇔ y = (125.87889-X)/0.12297 = 1023.655 - 8.132065X
abline(a=1023.655,b=-8.132065,lty="dotted")

# ↑で引かれた直線は横方向の残差平方和を最小化する

3.4 外れ値の影響→SKIP

4 章 赤池情報量基準によるモデル選択

メモ: - 決定係数: 説明力のみ - +汎用性の判断→AIC

4.1 cars再考

res<-lm(formula = dist~I(speed^2), data=cars)
plot(cars)
lines(cars$speed,fitted(res))

summary(res)
## 
## Call:
## lm(formula = dist ~ I(speed^2), data = cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.448  -9.211  -3.594   5.076  45.862 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.86005    4.08633   2.168   0.0351 *  
## I(speed^2)   0.12897    0.01319   9.781  5.2e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.05 on 48 degrees of freedom
## Multiple R-squared:  0.6659, Adjusted R-squared:  0.6589 
## F-statistic: 95.67 on 1 and 48 DF,  p-value: 5.2e-13
# 2次式
res <- lm(formula = dist ~ speed + I(speed^2), data = cars)
plot(cars)
lines(cars$speed, fitted(res), lty = 2) # 線のスタイルを点線に設定
summary(res)
## 
## Call:
## lm(formula = dist ~ speed + I(speed^2), data = cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.720  -9.184  -3.188   4.628  45.152 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  2.47014   14.81716   0.167    0.868
## speed        0.91329    2.03422   0.449    0.656
## I(speed^2)   0.09996    0.06597   1.515    0.136
## 
## Residual standard error: 15.18 on 47 degrees of freedom
## Multiple R-squared:  0.6673, Adjusted R-squared:  0.6532 
## F-statistic: 47.14 on 2 and 47 DF,  p-value: 5.852e-12
# 3次式
res <- lm(formula = dist ~ speed + I(speed^2) + I(speed^3), data = cars)
lines(cars$speed, fitted(res), col = "red") # 線のスタイルを点線に設定

summary(res)
## 
## Call:
## lm(formula = dist ~ speed + I(speed^2) + I(speed^3), data = cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.670  -9.601  -2.231   7.075  44.691 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -19.50505   28.40530  -0.687    0.496
## speed         6.80111    6.80113   1.000    0.323
## I(speed^2)   -0.34966    0.49988  -0.699    0.488
## I(speed^3)    0.01025    0.01130   0.907    0.369
## 
## Residual standard error: 15.2 on 46 degrees of freedom
## Multiple R-squared:  0.6732, Adjusted R-squared:  0.6519 
## F-statistic: 31.58 on 3 and 46 DF,  p-value: 3.074e-11
# 4次式
res <- lm(formula = dist ~ speed + I(speed^2) + I(speed^3) + I(speed^4), data = cars)
lines(cars$speed, fitted(res), col = "blue") # 線のスタイルを点線に設定
legend("topleft",legend = c("2次式", "3次式", "4次式"),
  col = c("black", "red", "blue"), lty = c(1, 2)
)

summary(res)
## 
## Call:
## lm(formula = dist ~ speed + I(speed^2) + I(speed^3) + I(speed^4), 
##     data = cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.701  -8.766  -2.861   7.158  42.186 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  45.845412  60.849115   0.753    0.455
## speed       -18.962244  22.296088  -0.850    0.400
## I(speed^2)    2.892190   2.719103   1.064    0.293
## I(speed^3)   -0.151951   0.134225  -1.132    0.264
## I(speed^4)    0.002799   0.002308   1.213    0.232
## 
## Residual standard error: 15.13 on 45 degrees of freedom
## Multiple R-squared:  0.6835, Adjusted R-squared:  0.6554 
## F-statistic:  24.3 on 4 and 45 DF,  p-value: 9.375e-11

4.2 AIC(赤池情報量基準)

res1 <- lm(formula = dist ~ speed, data = cars)
res2 <- lm(formula = dist ~ poly(speed, 2, raw = TRUE), data = cars)
res3 <- lm(formula = dist ~ poly(speed, 3, raw = TRUE), data = cars)
res4 <- lm(formula = dist ~ poly(speed, 4, raw = TRUE), data = cars)

summary(res1)$r.squared; summary(res1)$adj.r.squared
## [1] 0.6510794
## [1] 0.6438102
summary(res2)$r.squared; summary(res2)$adj.r.squared
## [1] 0.6673308
## [1] 0.6531747
summary(res3)$r.squared; summary(res3)$adj.r.squared
## [1] 0.6731808
## [1] 0.6518666
summary(res4)$r.squared; summary(res4)$adj.r.squared
## [1] 0.6835237
## [1] 0.6553925

次数をあげて推定パラメタを増やすと決定係数は改善できるがOverfittingに繋がる。
→どのように適切な推定パラメタを選ぶか?(モデル選択問題)
→AICを使う(AICが最小のものを選択する)

MLL: 最大対数尤度
モデルの当てはまりの良さ→AICを小さくする
パラメータ数Kの多さ→AICを大きくする

\[\begin{align*} AIC = -2MLL+2k \\ \end{align*}\]

AIC(res1,res2,res3,res4)
##      df      AIC
## res1  3 419.1569
## res2  4 418.7721
## res3  5 419.8850
## res4  6 420.2771
res02 <- lm(formula = dist ~ poly(speed, 2, raw = TRUE)-1, data = cars)
summary(res02)
## 
## Call:
## lm(formula = dist ~ poly(speed, 2, raw = TRUE) - 1, data = cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.836  -9.071  -3.152   4.570  44.986 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)   
## poly(speed, 2, raw = TRUE)1  1.23903    0.55997   2.213  0.03171 * 
## poly(speed, 2, raw = TRUE)2  0.09014    0.02939   3.067  0.00355 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.02 on 48 degrees of freedom
## Multiple R-squared:  0.9133, Adjusted R-squared:  0.9097 
## F-statistic: 252.8 on 2 and 48 DF,  p-value: < 2.2e-16
AIC(res02)
## [1] 416.8016

5 章 線形モデル

これは直線の方程式 \(y = mx + b\) です。

5.1 線形モデルの定式化

次のように表せると仮定 \[ y_j = α + βx_j + ε_j,\quad j=1,2,...,n\\ \] また、\(E[ε_j]=0,V[ε_j]=σ^2\)を満たすものとする。このようなモデルを線形単回帰モデルという。 定理7.

6 章 曲線へのあてはめ

6.1 lmを用いた曲線あてはめが上手く行く場合

\[y=1+2sinx+3cos2x+ε,\quad ε \sim N(0,0.2)\]

x <- seq(0, 2 * pi, length = 100)

y <- 1 + 2 * sin(x) + 3 * cos(2 * x) + rnorm(length(x), 0, 0.2)

# データフレームを作成する
data <- data.frame(x = x, y = y)

plot(data$x, data$y)
res <- lm(y ~ I(sin(x)) + I(cos(2 * x)))
summary(res)
## 
## Call:
## lm(formula = y ~ I(sin(x)) + I(cos(2 * x)))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.54133 -0.12554  0.01487  0.15706  0.51187 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.03116    0.02128   48.46   <2e-16 ***
## I(sin(x))      2.00412    0.03024   66.28   <2e-16 ***
## I(cos(2 * x))  3.04184    0.02994  101.60   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2127 on 97 degrees of freedom
## Multiple R-squared:  0.9935, Adjusted R-squared:  0.9933 
## F-statistic:  7357 on 2 and 97 DF,  p-value: < 2.2e-16
lines(x, fitted(res))

\[y=f(x)=\frac{a}{1+b e^{-c x}}\]

diffusion_rate <- read.csv("PC普及率.csv", header = FALSE)
colnames(diffusion_rate) <- c("year", "rate")

plot(diffusion_rate$year, diffusion_rate$rate)

year <- diffusion_rate$year
rate <- diffusion_rate$rate

year0 <- year - 1987

res <- nls(rate ~ a / (1 + b * exp(-c * year0)), start = c(a = 80, b = 1, c = 1), trace = TRUE)
## 50139.17    (1.55e+00): par = (80 1 1)
## 29739.03    (2.37e+00): par = (72.51707 1.034615 0.3826346)
## 11711.30    (2.91e+00): par = (70.47085 1.067224 0.07099902)
## 6664.014    (2.19e+00): par = (104.8972 3.047831 0.0650449)
## 5063.070    (2.03e+00): par = (84.22802 3.25924 0.09463527)
## 2410.674    (1.50e+00): par = (83.88331 5.251473 0.1400597)
## 1394.916    (1.26e+00): par = (86.58721 10.59881 0.199742)
## 641.6215    (5.63e-01): par = (84.33738 16.97232 0.2257461)
## 493.8297    (2.31e-01): par = (82.2809 23.18385 0.2477569)
## 467.4292    (7.20e-02): par = (81.62738 27.33159 0.2584358)
## 464.6063    (2.08e-02): par = (81.29687 29.28643 0.2635173)
## 464.3555    (6.05e-03): par = (81.18931 29.99237 0.2653881)
## 464.3336    (1.77e-03): par = (81.15613 30.21454 0.2659927)
## 464.3318    (5.15e-04): par = (81.14652 30.28061 0.2661735)
## 464.3316    (1.49e-04): par = (81.14373 30.29992 0.2662265)
## 464.3316    (4.33e-05): par = (81.14292 30.30553 0.2662419)
## 464.3316    (1.26e-05): par = (81.14268 30.30716 0.2662464)
## 464.3316    (3.63e-06): par = (81.14261 30.30763 0.2662477)
summary(res)
## 
## Formula: rate ~ a/(1 + b * exp(-c * year0))
## 
## Parameters:
##   Estimate Std. Error t value Pr(>|t|)    
## a 81.14261    2.54320  31.906  < 2e-16 ***
## b 30.30763    7.53606   4.022 0.000469 ***
## c  0.26625    0.02304  11.557  1.6e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.31 on 25 degrees of freedom
## 
## Number of iterations to convergence: 17 
## Achieved convergence tolerance: 3.628e-06
lines(year, fitted(res))

6.1.1 上手くいかない例

memo algorithm=Newton法。誤差0のデータや二乗和の最小化以外のタスクには適用不可

### 1.素直にyear0を使うと上手くいかない
# res <- nls(rate ~ a / (1 + b * exp(-c * year)), start = c(a = 80, b = 1, c = 1), trace = TRUE)

# Error in nlsModel(formula, mf, start, wts, scaleOffset = scOff, nDcentral = nDcntr) :
#  singular gradient matrix at initial parameter estimates

### 2.初期値によっては収束しない
# res <- nls(rate ~ a / (1 + b * exp(-c * year0)), start = c(a = 80, b = 0.1, c = 0.1), trace = TRUE)

# Error in nls(rate ~ a/(1 + b * exp(-c * year0)), start = c(a = 80, b = 0.1,  : 
#    step 因子 0.000488281 は 0.000976562 の 'minFactor' 以下に縮小しました 

6.2 変数変換+直線回帰

city <- read.csv("population.csv", header = FALSE)
city <- city$V1

ord<-order(city,decreasing=TRUE)
city_popl<-city[ord]
rank<-1:21

plot(rank, city_popl)

7

7.1

8

8.1

9

9.1

10

10.1

11

11.1

12

12.1

13

13.1

14

14.1

15

15.1

16

16.1

17

18

19

20

21

21.1

21.2

21.3

21.4

21.5