I have installed two new pakages from library.’Arules’ which is used for mining association rules and frequent itemsets.’ArulesViz’ is for visualizing association rules and frequent itemsets.
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
smalldf <- smalldf %>%
select(-c(short_description_en,justification_en,date_end))
colnames(smalldf)
## [1] "category" "states_name_en" "region_en" "unique_number"
## [5] "id_no" "rev_bis" "name_en" "date_inscribed"
## [9] "secondary_dates" "danger" "danger_list" "longitude"
## [13] "latitude" "area_hectares" "criteria_txt" "category_short"
## [17] "iso_code" "udnp_code" "transboundary"
transactions(smalldf)
## Warning: Column(s) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
## 17, 18, 19 not logical or factor. Applying default discretization (see '?
## discretizeDF').
## Warning in discretize(x = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, : The calculated breaks are: 0, 0, 0, 1
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Warning in discretize(x = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, : The calculated breaks are: 0, 0, 0, 1
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## transactions in sparse format with
## 45 transactions (rows) and
## 207 items (columns)
colnames(smalldf)[c(1,2,3,4,8,11,12)]
## [1] "category" "states_name_en" "region_en" "unique_number"
## [5] "date_inscribed" "danger_list" "longitude"
smalldf <- smalldf %>% mutate(
danger = (danger > 0),
date_inscribed = (date_inscribed >0)
)
trans <- transactions(smalldf)
## Warning: Column(s) 1, 2, 3, 4, 5, 6, 7, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19
## not logical or factor. Applying default discretization (see '? discretizeDF').
## Warning in discretize(x = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, : The calculated breaks are: 0, 0, 0, 1
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
Here it shows errors! We can convert them into factors (or Boolean) for analysis.
as(df,"transactions")
## Warning: Column(s) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
## 18, 19, 20, 21, 22 not logical or factor. Applying default discretization (see
## '? discretizeDF').
## Warning in discretize(x = c(1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, : The calculated breaks are: 0, 0, 0, 1
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Warning in discretize(x = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, : The calculated breaks are: 0, 0, 0, 1
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## transactions in sparse format with
## 1121 transactions (rows) and
## 3429 items (columns)
summary(trans)
## transactions as itemMatrix in sparse format with
## 45 rows (elements/itemsets/transactions) and
## 205 columns (items) and a density of 0.08791328
##
## most frequent items:
## date_inscribed transboundary=[0,1] danger_list= secondary_dates=
## 45 45 42 40
## category=Cultural (Other)
## 35 604
##
## element (itemset/transaction) length distribution:
## sizes
## 18 19
## 44 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 18.00 18.00 18.02 18.00 19.00
##
## includes extended item information - examples:
## labels variables levels
## 1 category=Cultural category Cultural
## 2 category=Mixed category Mixed
## 3 category=Natural category Natural
##
## includes extended transaction information - examples:
## transactionID
## 1 1
## 2 2
## 3 3
colnames(trans)
## [1] "category=Cultural"
## [2] "category=Mixed"
## [3] "category=Natural"
## [4] "states_name_en=Afghanistan"
## [5] "states_name_en=Argentina,Brazil"
## [6] "states_name_en=Azerbaijan"
## [7] "states_name_en=Brazil"
## [8] "states_name_en=Canada"
## [9] "states_name_en=China"
## [10] "states_name_en=Croatia"
## [11] "states_name_en=Czechia"
## [12] "states_name_en=Finland"
## [13] "states_name_en=Germany"
## [14] "states_name_en=Greece"
## [15] "states_name_en=Italy"
## [16] "states_name_en=Italy,Switzerland"
## [17] "states_name_en=Japan"
## [18] "states_name_en=Lao People's Democratic Republic"
## [19] "states_name_en=Mexico"
## [20] "states_name_en=Mongolia"
## [21] "states_name_en=New Zealand"
## [22] "states_name_en=Peru"
## [23] "states_name_en=Republic of Korea"
## [24] "states_name_en=Russian Federation"
## [25] "states_name_en=Saudi Arabia"
## [26] "states_name_en=Spain"
## [27] "states_name_en=Tajikistan"
## [28] "states_name_en=Togo"
## [29] "states_name_en=United Kingdom of Great Britain and Northern Ireland"
## [30] "states_name_en=United Republic of Tanzania"
## [31] "states_name_en=United States of America"
## [32] "states_name_en=Yemen"
## [33] "region_en=Africa"
## [34] "region_en=Arab States"
## [35] "region_en=Asia and the Pacific"
## [36] "region_en=Europe and North America"
## [37] "region_en=Latin America and the Caribbean"
## [38] "unique_number=[234,1.02e+03)"
## [39] "unique_number=[1.02e+03,1.65e+03)"
## [40] "unique_number=[1.65e+03,2.32e+03]"
## [41] "id_no=[39,777)"
## [42] "id_no=[777,1.21e+03)"
## [43] "id_no=[1.21e+03,1.56e+03]"
## [44] "rev_bis="
## [45] "rev_bis=-894 Rev"
## [46] "rev_bis=bis"
## [47] "rev_bis=Bis"
## [48] "rev_bis=rev"
## [49] "rev_bis=Rev"
## [50] "rev_bis=ter"
## [51] "name_en=Al-Ahsa Oasis, an Evolving Cultural Landscape"
## [52] "name_en=Archaeological Area and the Patriarchal Basilica of Aquileia"
## [53] "name_en=Archaeological Areas of Pompei, Herculaneum and Torre Annunziata"
## [54] "name_en=Archaeological Site of Delphi"
## [55] "name_en=At-Turaif District in ad-Dir'iyah"
## [56] "name_en=Atlantic Forest South-East Reserves"
## [57] "name_en=Canterbury Cathedral, St Augustine's Abbey, and St Martin's Church"
## [58] "name_en=Ensemble of the Novodevichy Convent"
## [59] "name_en=Gardens and Castle at KroměřÞ"
## [60] "name_en=Gobustan Rock Art Cultural Landscape"
## [61] "name_en=Hidden Christian Sites in the Nagasaki Region"
## [62] "name_en=Historic Centre of Lima"
## [63] "name_en=Historic Centre of the Town of Diamantina"
## [64] "name_en=Iguaçu National Park"
## [65] "name_en=Iwami Ginzan Silver Mine and its Cultural Landscape"
## [66] "name_en=Jesuit Missions of the Guaranis: San Ignacio Mini, Santa Ana, Nuestra Señora de Loreto and Santa Maria Mayor (Argentina), Ruins of Sao Miguel das Missoes (Brazil)"
## [67] "name_en=Koutammakou, the Land of the Batammariba"
## [68] "name_en=Maritime Greenwich"
## [69] "name_en=Minaret and Archaeological Remains of Jam"
## [70] "name_en=Mines of Rammelsberg, Historic Town of Goslar and Upper Harz Water Management System"
## [71] "name_en=New Zealand Sub-Antarctic Islands"
## [72] "name_en=Ngorongoro Conservation Area"
## [73] "name_en=Old Town Lunenburg"
## [74] "name_en=Old Town of Lijiang"
## [75] "name_en=Old town of Regensburg with Stadtamhof"
## [76] "name_en=Paraty and Ilha Grande – Culture and Biodiversity"
## [77] "name_en=Petäjävesi Old Church"
## [78] "name_en=Petroglyphic Complexes of the Mongolian Altai"
## [79] "name_en=Pre-Hispanic City of Teotihuacan"
## [80] "name_en=Qinghai Hoh Xil"
## [81] "name_en=Rhaetian Railway in the Albula / Bernina Landscapes"
## [82] "name_en=RÃo Abiseo National Park"
## [83] "name_en=Royal Botanic Gardens, Kew"
## [84] "name_en=Royal Monastery of Santa MarÃa de Guadalupe"
## [85] "name_en=Royal Tombs of the Joseon Dynasty"
## [86] "name_en=Sacred Sites and Pilgrimage Routes in the Kii Mountain Range"
## [87] "name_en=San Antonio Missions"
## [88] "name_en=Sansa, Buddhist Mountain Monasteries in Korea"
## [89] "name_en=Shiretoko"
## [90] "name_en=Socotra Archipelago"
## [91] "name_en=Stari Grad Plain"
## [92] "name_en=Studley Royal Park including the Ruins of Fountains Abbey"
## [93] "name_en=Tajik National Park (Mountains of the Pamirs)"
## [94] "name_en=Tower of Hercules"
## [95] "name_en=Vat Phou and Associated Ancient Settlements within the Champasak Cultural Landscape"
## [96] "date_inscribed"
## [97] "secondary_dates="
## [98] "secondary_dates=1984"
## [99] "secondary_dates=1991"
## [100] "secondary_dates=1992"
## [101] "secondary_dates=2010"
## [102] "danger"
## [103] "danger_list="
## [104] "danger_list=P 1984-1989"
## [105] "danger_list=P 1999-2001"
## [106] "danger_list=Y 2002"
## [107] "longitude=[-98.8,0.721)"
## [108] "longitude=[0.721,49.5)"
## [109] "longitude=[49.5,166]"
## [110] "latitude=[-50.8,26.4)"
## [111] "latitude=[26.4,43.2)"
## [112] "latitude=[43.2,62.2]"
## [113] "area_hectares=[0,141)"
## [114] "area_hectares=[141,3.12e+03)"
## [115] "area_hectares=[3.12e+03,3.74e+06]"
## [116] "criteria_txt=(i)(ii)(iii)(iv)"
## [117] "criteria_txt=(i)(ii)(iii)(iv)(vi)"
## [118] "criteria_txt=(i)(ii)(iv)(vi)"
## [119] "criteria_txt=(i)(ii)(vi)"
## [120] "criteria_txt=(i)(iv)"
## [121] "criteria_txt=(i)(iv)(vi)"
## [122] "criteria_txt=(ii)"
## [123] "criteria_txt=(ii)(iii)(iv)"
## [124] "criteria_txt=(ii)(iii)(iv)(vi)"
## [125] "criteria_txt=(ii)(iii)(v)"
## [126] "criteria_txt=(ii)(iv)"
## [127] "criteria_txt=(ii)(iv)(v)"
## [128] "criteria_txt=(iii)"
## [129] "criteria_txt=(iii)(iv)(v)"
## [130] "criteria_txt=(iii)(iv)(vi)"
## [131] "criteria_txt=(iii)(vii)(ix)(x)"
## [132] "criteria_txt=(iv)"
## [133] "criteria_txt=(iv)(v)"
## [134] "criteria_txt=(iv)(v)(vi)"
## [135] "criteria_txt=(iv)(vi)"
## [136] "criteria_txt=(iv)(vii)(viii)(ix)(x)"
## [137] "criteria_txt=(ix)(x)"
## [138] "criteria_txt=(v)(vi)"
## [139] "criteria_txt=(v)(x)"
## [140] "criteria_txt=(vii)(ix)(x)"
## [141] "criteria_txt=(vii)(viii)"
## [142] "criteria_txt=(vii)(x)"
## [143] "criteria_txt=(x)"
## [144] "category_short=C"
## [145] "category_short=M"
## [146] "category_short=N"
## [147] "iso_code=af"
## [148] "iso_code=ar,br"
## [149] "iso_code=az"
## [150] "iso_code=br"
## [151] "iso_code=ca"
## [152] "iso_code=cn"
## [153] "iso_code=cz"
## [154] "iso_code=de"
## [155] "iso_code=es"
## [156] "iso_code=fi"
## [157] "iso_code=gb"
## [158] "iso_code=gr"
## [159] "iso_code=hr"
## [160] "iso_code=it"
## [161] "iso_code=it,ch"
## [162] "iso_code=jp"
## [163] "iso_code=kr"
## [164] "iso_code=la"
## [165] "iso_code=mn"
## [166] "iso_code=mx"
## [167] "iso_code=nz"
## [168] "iso_code=pe"
## [169] "iso_code=ru"
## [170] "iso_code=sa"
## [171] "iso_code=tg"
## [172] "iso_code=tj"
## [173] "iso_code=tz"
## [174] "iso_code=us"
## [175] "iso_code=ye"
## [176] "udnp_code=afg"
## [177] "udnp_code=arg,bra"
## [178] "udnp_code=aze"
## [179] "udnp_code=bra"
## [180] "udnp_code=can"
## [181] "udnp_code=chn"
## [182] "udnp_code=cze"
## [183] "udnp_code=deu"
## [184] "udnp_code=esp"
## [185] "udnp_code=fin"
## [186] "udnp_code=gbr"
## [187] "udnp_code=grc"
## [188] "udnp_code=hrv"
## [189] "udnp_code=ita"
## [190] "udnp_code=ita,che"
## [191] "udnp_code=jpn"
## [192] "udnp_code=kor"
## [193] "udnp_code=lao"
## [194] "udnp_code=mex"
## [195] "udnp_code=mng"
## [196] "udnp_code=nzl"
## [197] "udnp_code=per"
## [198] "udnp_code=rus"
## [199] "udnp_code=sau"
## [200] "udnp_code=tgo"
## [201] "udnp_code=tjk"
## [202] "udnp_code=tza"
## [203] "udnp_code=usa"
## [204] "udnp_code=yem"
## [205] "transboundary=[0,1]"
inspect(trans[1:3])
## items transactionID
## [1] {category=Cultural,
## states_name_en=Saudi Arabia,
## region_en=Arab States,
## unique_number=[1.65e+03,2.32e+03],
## id_no=[1.21e+03,1.56e+03],
## rev_bis=,
## name_en=At-Turaif District in ad-Dir'iyah,
## date_inscribed,
## secondary_dates=,
## danger_list=,
## longitude=[0.721,49.5),
## latitude=[-50.8,26.4),
## area_hectares=[0,141),
## criteria_txt=(iv)(v)(vi),
## category_short=C,
## iso_code=sa,
## udnp_code=sau,
## transboundary=[0,1]} 1
## [2] {category=Natural,
## states_name_en=Tajikistan,
## region_en=Asia and the Pacific,
## unique_number=[1.65e+03,2.32e+03],
## id_no=[1.21e+03,1.56e+03],
## rev_bis=Rev,
## name_en=Tajik National Park (Mountains of the Pamirs),
## date_inscribed,
## secondary_dates=,
## danger_list=,
## longitude=[49.5,166],
## latitude=[26.4,43.2),
## area_hectares=[3.12e+03,3.74e+06],
## criteria_txt=(vii)(viii),
## category_short=N,
## iso_code=tj,
## udnp_code=tjk,
## transboundary=[0,1]} 2
## [3] {category=Cultural,
## states_name_en=Peru,
## region_en=Latin America and the Caribbean,
## unique_number=[234,1.02e+03),
## id_no=[39,777),
## rev_bis=bis,
## name_en=Historic Centre of Lima,
## date_inscribed,
## secondary_dates=1991,
## danger_list=,
## longitude=[-98.8,0.721),
## latitude=[-50.8,26.4),
## area_hectares=[141,3.12e+03),
## criteria_txt=(iv),
## category_short=C,
## iso_code=pe,
## udnp_code=per,
## transboundary=[0,1]} 3
image(trans)
itemFrequencyPlot(trans,topN = 20)
vertical <- as(trans, "tidLists")
as(vertical, "matrix")[1:10, 1:5]
## 1 2 3 4 5
## category=Cultural TRUE FALSE TRUE TRUE TRUE
## category=Mixed FALSE FALSE FALSE FALSE FALSE
## category=Natural FALSE TRUE FALSE FALSE FALSE
## states_name_en=Afghanistan FALSE FALSE FALSE FALSE FALSE
## states_name_en=Argentina,Brazil FALSE FALSE FALSE FALSE FALSE
## states_name_en=Azerbaijan FALSE FALSE FALSE FALSE FALSE
## states_name_en=Brazil FALSE FALSE FALSE FALSE FALSE
## states_name_en=Canada FALSE FALSE FALSE FALSE FALSE
## states_name_en=China FALSE FALSE FALSE FALSE FALSE
## states_name_en=Croatia FALSE FALSE FALSE FALSE FALSE
trans
## transactions in sparse format with
## 45 transactions (rows) and
## 205 items (columns)
its <- apriori(trans, parameter=list(target = "frequent"))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## NA 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 frequent itemsets TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 4
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[205 item(s), 45 transaction(s)] done [0.00s].
## sorting and recoding items ... [29 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(trans, parameter = list(target = "frequent")): Mining stopped
## (maxlen reached). Only patterns up to a length of 10 returned!
## done [0.00s].
## sorting transactions ... done [0.00s].
## writing ... [11480 set(s)] done [0.00s].
## creating S4 object ... done [0.00s].
its
## set of 11480 itemsets
inspect(head(its, n = 10))
## items support
## [1] {rev_bis=Bis} 0.1111111
## [2] {criteria_txt=(iii)} 0.1111111
## [3] {category=Natural} 0.1555556
## [4] {category_short=N} 0.1555556
## [5] {region_en=Latin America and the Caribbean} 0.1777778
## [6] {region_en=Asia and the Pacific} 0.2888889
## [7] {latitude=[-50.8,26.4)} 0.3333333
## [8] {longitude=[0.721,49.5)} 0.3333333
## [9] {longitude=[-98.8,0.721)} 0.3333333
## [10] {area_hectares=[141,3.12e+03)} 0.3333333
## transIdenticalToItemsets count
## [1] 0 5
## [2] 0 5
## [3] 0 7
## [4] 0 7
## [5] 0 8
## [6] 0 13
## [7] 0 15
## [8] 0 15
## [9] 0 15
## [10] 0 15
ggplot(tibble(`Itemset Size` = factor(size(its))), aes(`Itemset Size`)) + geom_bar()
rules <- apriori(trans, parameter = list(support = 0.07, confidence = 0.5))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.07 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 3
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[205 item(s), 45 transaction(s)] done [0.00s].
## sorting and recoding items ... [38 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(trans, parameter = list(support = 0.07, confidence = 0.5)):
## Mining stopped (maxlen reached). Only patterns up to a length of 10 returned!
## done [0.00s].
## writing ... [103557 rule(s)] done [0.02s].
## creating S4 object ... done [0.02s].
inspect(head(rules))
## lhs rhs support confidence coverage lift count
## [1] {} => {rev_bis=} 0.7333333 0.7333333 1 1 33
## [2] {} => {category=Cultural} 0.7777778 0.7777778 1 1 35
## [3] {} => {category_short=C} 0.7777778 0.7777778 1 1 35
## [4] {} => {secondary_dates=} 0.8888889 0.8888889 1 1 40
## [5] {} => {danger_list=} 0.9333333 0.9333333 1 1 42
## [6] {} => {transboundary=[0,1]} 1.0000000 1.0000000 1 1 45
plot(rules,jitter = 1)
plot(rules, shading = "order")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
plot(head(rules, n = 12), method = "graph")
tail(smalldf)
## category states_name_en
## 40 Cultural Saudi Arabia
## 41 Cultural Germany
## 42 Cultural Mexico
## 43 Cultural United Kingdom of Great Britain and Northern Ireland
## 44 Mixed United Republic of Tanzania
## 45 Cultural Republic of Korea
## region_en unique_number id_no rev_bis
## 40 Arab States 2228 1563
## 41 Europe and North America 1335 1155
## 42 Latin America and the Caribbean 477 414
## 43 Europe and North America 1262 1084
## 44 Africa 1639 39 Bis
## 45 Asia and the Pacific 2227 1562
## name_en date_inscribed secondary_dates
## 40 Al-Ahsa Oasis, an Evolving Cultural Landscape TRUE
## 41 Old town of Regensburg with Stadtamhof TRUE
## 42 Pre-Hispanic City of Teotihuacan TRUE
## 43 Royal Botanic Gardens, Kew TRUE
## 44 Ngorongoro Conservation Area TRUE 2010
## 45 Sansa, Buddhist Mountain Monasteries in Korea TRUE
## danger danger_list longitude latitude area_hectares criteria_txt
## 40 FALSE 49.6305694 25.40217 8544.00 (iii)(iv)(v)
## 41 FALSE 12.0991667 49.02056 182.80 (ii)(iii)(iv)
## 42 FALSE -98.8416700 19.69167 250.00 (i)(ii)(iii)(iv)(vi)
## 43 FALSE -0.2940278 51.48194 132.00 (ii)(iii)(iv)
## 44 FALSE P 1984-1989 35.5408300 -3.18722 809440.00 (iv)(vii)(viii)(ix)(x)
## 45 FALSE 127.8333333 36.54194 55.43 (iii)
## category_short iso_code udnp_code transboundary
## 40 C sa sau 0
## 41 C de deu 0
## 42 C mx mex 0
## 43 C gb gbr 0
## 44 M tz tza 0
## 45 C kr kor 0
I will do a simple linear regression with date_inscribed and id_no variable.
linear <- lm(date_inscribed ~ id_no, df)
linear
##
## Call:
## lm(formula = date_inscribed ~ id_no, data = df)
##
## Coefficients:
## (Intercept) id_no
## 1.978e+03 2.432e-02
summary(linear)
##
## Call:
## lm(formula = date_inscribed ~ id_no, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.660 -1.587 -1.052 0.438 33.955
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.978e+03 2.151e-01 9196.7 <2e-16 ***
## id_no 2.432e-02 2.328e-04 104.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.557 on 1119 degrees of freedom
## Multiple R-squared: 0.907, Adjusted R-squared: 0.9069
## F-statistic: 1.091e+04 on 1 and 1119 DF, p-value: < 2.2e-16
The adjusted R-squared is close to 1 i.e. 0.9069 and also p-value is less than 0.05, which means that our model is statistically significant. This model is excellent!
plot(linear)
multiple <- lm(date_inscribed ~ id_no + category, smalldf)
summary(multiple)
##
## Call:
## lm(formula = date_inscribed ~ id_no + category, data = smalldf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.713e-16 -3.602e-16 -1.943e-16 -2.880e-17 8.483e-15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.000e+00 5.298e-16 1.888e+15 <2e-16 ***
## id_no 5.014e-19 5.117e-19 9.800e-01 0.333
## categoryMixed -1.044e-16 8.324e-16 -1.250e-01 0.901
## categoryNatural -3.157e-16 5.664e-16 -5.570e-01 0.580
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.36e-15 on 41 degrees of freedom
## Multiple R-squared: 0.4988, Adjusted R-squared: 0.4622
## F-statistic: 13.6 on 3 and 41 DF, p-value: 2.659e-06
The adjusted R-squared is near to 1 i.e. 0.7955 and also p-value is less than 0.05, which means that this model is statistically significant. This model is good to use!
multiple2 <- lm(date_inscribed ~ id_no: category, smalldf)
summary(multiple2)
##
## Call:
## lm(formula = date_inscribed ~ id_no:category, data = smalldf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.302e-16 -3.895e-16 -1.602e-16 -4.400e-18 8.441e-15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.000e+00 5.065e-16 1.974e+15 <2e-16 ***
## id_no:categoryCultural 5.718e-19 5.143e-19 1.112e+00 0.273
## id_no:categoryMixed 2.481e-19 1.068e-18 2.320e-01 0.818
## id_no:categoryNatural 2.246e-19 6.317e-19 3.550e-01 0.724
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.356e-15 on 41 degrees of freedom
## Multiple R-squared: 0.5015, Adjusted R-squared: 0.465
## F-statistic: 13.75 on 3 and 41 DF, p-value: 2.388e-06
Same with this model which is also statistically significant. This model is good to use!
multiple3 <- lm(date_inscribed ~ id_no*category, smalldf)
summary(multiple3)
##
## Call:
## lm(formula = date_inscribed ~ id_no * category, data = smalldf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.714e-16 -3.935e-16 -1.287e-16 0.000e+00 8.420e-15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.000e+00 6.068e-16 1.648e+15 <2e-16 ***
## id_no 6.603e-19 5.999e-19 1.101e+00 0.278
## categoryMixed 3.606e-16 1.398e-15 2.580e-01 0.798
## categoryNatural 3.606e-16 1.751e-15 2.060e-01 0.838
## id_no:categoryMixed -6.603e-19 1.651e-18 -4.000e-01 0.691
## id_no:categoryNatural -6.603e-19 1.594e-18 -4.140e-01 0.681
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.389e-15 on 39 degrees of freedom
## Multiple R-squared: 0.5019, Adjusted R-squared: 0.438
## F-statistic: 7.859 on 5 and 39 DF, p-value: 3.404e-05
This one looks quite similar to the other multiple regression. This model is statistically significant. I will run a regression predicting date_inscribed on all of our data.
multiple4 <- lm(date_inscribed ~ region_en + states_name_en + danger + category_short , data = smalldf)
summary(multiple4)
##
## Call:
## lm(formula = date_inscribed ~ region_en + states_name_en + danger +
## category_short, data = smalldf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.469e-15 0.000e+00 0.000e+00 0.000e+00 4.469e-15
##
## Coefficients: (5 not defined because of singularities)
## Estimate
## (Intercept) 1.000e+00
## region_enArab States 8.403e-30
## region_enAsia and the Pacific 3.322e-30
## region_enEurope and North America 4.676e-30
## region_enLatin America and the Caribbean 2.479e-30
## states_name_enArgentina,Brazil 2.376e-30
## states_name_enAzerbaijan 1.179e-30
## states_name_enBrazil 2.681e-30
## states_name_enCanada 6.280e-31
## states_name_enChina 2.018e-30
## states_name_enCroatia 8.706e-31
## states_name_enCzechia 5.081e-31
## states_name_enFinland 7.299e-31
## states_name_enGermany 3.555e-31
## states_name_enGreece 2.516e-31
## states_name_enItaly 1.098e-30
## states_name_enItaly,Switzerland -5.923e-31
## states_name_enJapan 1.866e-30
## states_name_enLao People's Democratic Republic 1.927e-30
## states_name_enMexico 2.478e-30
## states_name_enMongolia 1.998e-30
## states_name_enNew Zealand 2.093e-30
## states_name_enPeru NA
## states_name_enRepublic of Korea 1.428e-30
## states_name_enRussian Federation 2.362e-31
## states_name_enSaudi Arabia 4.469e-15
## states_name_enSpain 6.828e-31
## states_name_enTajikistan 1.427e-30
## states_name_enTogo 5.175e-30
## states_name_enUnited Kingdom of Great Britain and Northern Ireland 4.410e-31
## states_name_enUnited Republic of Tanzania NA
## states_name_enUnited States of America NA
## states_name_enYemen NA
## dangerTRUE NA
## category_shortM 2.665e-30
## category_shortN -1.722e-31
## Std. Error
## (Intercept) 2.315e-15
## region_enArab States 2.913e-15
## region_enAsia and the Pacific 2.865e-15
## region_enEurope and North America 2.865e-15
## region_enLatin America and the Caribbean 2.215e-15
## states_name_enArgentina,Brazil 2.215e-15
## states_name_enAzerbaijan 2.389e-15
## states_name_enBrazil 1.583e-15
## states_name_enCanada 2.389e-15
## states_name_enChina 2.151e-15
## states_name_enCroatia 2.389e-15
## states_name_enCzechia 2.389e-15
## states_name_enFinland 2.389e-15
## states_name_enGermany 2.069e-15
## states_name_enGreece 2.389e-15
## states_name_enItaly 2.069e-15
## states_name_enItaly,Switzerland 2.389e-15
## states_name_enJapan 1.911e-15
## states_name_enLao People's Democratic Republic 2.389e-15
## states_name_enMexico 2.215e-15
## states_name_enMongolia 2.389e-15
## states_name_enNew Zealand 2.664e-15
## states_name_enPeru NA
## states_name_enRepublic of Korea 2.069e-15
## states_name_enRussian Federation 2.389e-15
## states_name_enSaudi Arabia 2.381e-15
## states_name_enSpain 2.069e-15
## states_name_enTajikistan 2.664e-15
## states_name_enTogo 2.865e-15
## states_name_enUnited Kingdom of Great Britain and Northern Ireland 1.888e-15
## states_name_enUnited Republic of Tanzania NA
## states_name_enUnited States of America NA
## states_name_enYemen NA
## dangerTRUE NA
## category_shortM 1.583e-15
## category_shortN 1.180e-15
## t value
## (Intercept) 4.320e+14
## region_enArab States 0.000e+00
## region_enAsia and the Pacific 0.000e+00
## region_enEurope and North America 0.000e+00
## region_enLatin America and the Caribbean 0.000e+00
## states_name_enArgentina,Brazil 0.000e+00
## states_name_enAzerbaijan 0.000e+00
## states_name_enBrazil 0.000e+00
## states_name_enCanada 0.000e+00
## states_name_enChina 0.000e+00
## states_name_enCroatia 0.000e+00
## states_name_enCzechia 0.000e+00
## states_name_enFinland 0.000e+00
## states_name_enGermany 0.000e+00
## states_name_enGreece 0.000e+00
## states_name_enItaly 0.000e+00
## states_name_enItaly,Switzerland 0.000e+00
## states_name_enJapan 0.000e+00
## states_name_enLao People's Democratic Republic 0.000e+00
## states_name_enMexico 0.000e+00
## states_name_enMongolia 0.000e+00
## states_name_enNew Zealand 0.000e+00
## states_name_enPeru NA
## states_name_enRepublic of Korea 0.000e+00
## states_name_enRussian Federation 0.000e+00
## states_name_enSaudi Arabia 1.877e+00
## states_name_enSpain 0.000e+00
## states_name_enTajikistan 0.000e+00
## states_name_enTogo 0.000e+00
## states_name_enUnited Kingdom of Great Britain and Northern Ireland 0.000e+00
## states_name_enUnited Republic of Tanzania NA
## states_name_enUnited States of America NA
## states_name_enYemen NA
## dangerTRUE NA
## category_shortM 0.000e+00
## category_shortN 0.000e+00
## Pr(>|t|)
## (Intercept) <2e-16 ***
## region_enArab States 1.0000
## region_enAsia and the Pacific 1.0000
## region_enEurope and North America 1.0000
## region_enLatin America and the Caribbean 1.0000
## states_name_enArgentina,Brazil 1.0000
## states_name_enAzerbaijan 1.0000
## states_name_enBrazil 1.0000
## states_name_enCanada 1.0000
## states_name_enChina 1.0000
## states_name_enCroatia 1.0000
## states_name_enCzechia 1.0000
## states_name_enFinland 1.0000
## states_name_enGermany 1.0000
## states_name_enGreece 1.0000
## states_name_enItaly 1.0000
## states_name_enItaly,Switzerland 1.0000
## states_name_enJapan 1.0000
## states_name_enLao People's Democratic Republic 1.0000
## states_name_enMexico 1.0000
## states_name_enMongolia 1.0000
## states_name_enNew Zealand 1.0000
## states_name_enPeru NA
## states_name_enRepublic of Korea 1.0000
## states_name_enRussian Federation 1.0000
## states_name_enSaudi Arabia 0.0816 .
## states_name_enSpain 1.0000
## states_name_enTajikistan 1.0000
## states_name_enTogo 1.0000
## states_name_enUnited Kingdom of Great Britain and Northern Ireland 1.0000
## states_name_enUnited Republic of Tanzania NA
## states_name_enUnited States of America NA
## states_name_enYemen NA
## dangerTRUE NA
## category_shortM 1.0000
## category_shortN 1.0000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.689e-15 on 14 degrees of freedom
## Multiple R-squared: 0.4969, Adjusted R-squared: -0.5812
## F-statistic: 0.4609 on 30 and 14 DF, p-value: 0.9632
ggplot(df,aes(x= date_inscribed, y = id_no))+
geom_jitter(color = "Red") +
geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'
Overall most points are near straight line. It seem appropriate to apply the linear regression to this data and use it.
Next, i will do on multiple to see difference.
ggplot(df,aes(x= date_inscribed, y = id_no, color = category))+
geom_point()+
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
#smalldfoff <- smalldf %>% filter(category == "category")
ggplot(df,aes(x= date_inscribed, y = id_no))+
geom_jitter()+
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
This is a really good regression! It can be used for prediction.
ggplot(df,aes(x= date_inscribed, y = id_no))+
geom_point()+
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
ggplot(df,aes(x= date_inscribed, y = id_no))+
geom_point()+
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
As we can see, most of the data point are on or close to line, which makes a good regression plot.