A basic data exploration script for the “Integer Sequence Learning” Kaggle challenge. This competition challenges participants to build a machine learning algorithm capable of guessing the next number in an integer sequence.
https://www.kaggle.com/c/integer-sequence-learning
# init: remove every object from the workspace, then run the garbage
# collector (its memory report is printed below)
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 311892 8.4 592000 15.9 350000 9.4
## Vcells 340274 2.6 786432 6.0 677342 5.2
# libs
library(stringr)
#library(ggplot2)
# read both tables straight out of their zip archives
train <- read.csv(unz("train.csv.zip", "train.csv"))
test <- read.csv(unz("test.csv.zip", "test.csv"))
# alternative: plain CSV files under ../input
#train = read.csv("../input/train.csv")
#test = read.csv("../input/test.csv")
# show the structure of both tables, test first
str(test)
str(train)
## 'data.frame': 113845 obs. of 2 variables:
## $ Id : int 1 2 4 5 6 9 10 12 14 17 ...
## $ Sequence: Factor w/ 112724 levels "-1,-1,-1,-1,-1,-1,-1,-1,0,-1,-1,-1,-1,1,-1,-1,-1,-1,0,1,-1,-1,-1,-1,-1,1,0,-1,-1,-1,-1,-1,0,2,-1,-1,-1,-1,-1,-1,-1,1,2,-1,-1,-1"| __truncated__,..: 21170 37154 8655 61774 33817 16343 33595 2584 99129 22897 ...
## 'data.frame': 113845 obs. of 2 variables:
## $ Id : int 3 7 8 11 13 15 16 18 20 21 ...
## $ Sequence: Factor w/ 112880 levels "-1,-1,-1,-1,-1,-1,-1,-1,8,-1,-10,-19,-28,-37,-46,-55,-64,-73,17,8,-1,-10,-19,-28,-37,-46,-55,-64,26,17,8,-1,-10,-19,-28,-37,-46"| __truncated__,..: 56405 42851 49586 69391 39594 25563 111167 53155 99989 42592 ...
# table dimensions, test first
dim(test)
dim(train)
## [1] 113845 2
## [1] 113845 2
# how are the Ids from test/train distributed
head(train[["Id"]])
head(test[["Id"]])
## [1] 3 7 8 11 13 15
## [1] 1 2 4 5 6 9
# is there an overlap? Full outer join on Id: assuming Ids are unique within
# each table, a row count equal to nrow(test) + nrow(train) means no Id is
# shared between the two sets
dat <- merge(test, train, by = "Id", all = TRUE)
dim(dat)
## [1] 227690 3
# explore the sequences: print the first training sequence
train[["Sequence"]][1]
## [1] 1,3,13,87,1053,28576,2141733,508147108,402135275365,1073376057490373,9700385489355970183,298434346895322960005291,31479360095907908092817694945,11474377948948020660089085281068730
## 112880 Levels: -1,-1,-1,-1,-1,-1,-1,-1,8,-1,-10,-19,-28,-37,-46,-55,-64,-73,17,8,-1,-10,-19,-28,-37,-46,-55,-64,26,17,8,-1,-10,-19,-28,-37,-46,-55,35,26,17,8,-1,-10,-19,-28,-37,-46,44,35,26,17,8,-1,-10,-19,-28,-37,53,44,35 ...
# ...and the first test sequence
test[["Sequence"]][1]
## [1] 1,0,0,2,24,552,21280,103760,70299264,5792853248,587159944704
## 112724 Levels: -1,-1,-1,-1,-1,-1,-1,-1,0,-1,-1,-1,-1,1,-1,-1,-1,-1,0,1,-1,-1,-1,-1,-1,1,0,-1,-1,-1,-1,-1,0,2,-1,-1,-1,-1,-1,-1,-1,1,2,-1,-1,-1,-1,-1,-1,-1,0,2,1,0,-1,-1,-1,-1,-1,-1,-1,1,3,-1,1,-1,-1,-1,-1,-1,-1,-1,0,2,3,-3,1,-1,-1,-1,-1,-1,-1,-1,-1,1,3,2,-4,0,-1,-1 ...
# count all numbers in both sets: terms are comma-separated, so a sequence
# holds one more number than it has commas
train_numbers_count <- 1 + str_count(train$Sequence, ",")
test_numbers_count <- 1 + str_count(test$Sequence, ",")
# are the test sequences about one number shorter than the training ones?
summary(train_numbers_count)
summary(test_numbers_count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 19.00 34.00 41.67 59.00 348.00
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 18.00 33.00 40.54 57.00 347.00
# ...so it appears
# histograms of the sequence lengths:
# interesting spikes at 50 / 100 in training set (test: 49/99)
hist(train_numbers_count, breaks = 100, col = rainbow(50))
hist(test_numbers_count, breaks = 100, col = rainbow(50))
# so some series are really long
# what numbers are actually involved: split the first ten training
# sequences into their individual terms
strsplit(as.character(train$Sequence[1:10]), ",", fixed = TRUE)
## [[1]]
## [1] "1"
## [2] "3"
## [3] "13"
## [4] "87"
## [5] "1053"
## [6] "28576"
## [7] "2141733"
## [8] "508147108"
## [9] "402135275365"
## [10] "1073376057490373"
## [11] "9700385489355970183"
## [12] "298434346895322960005291"
## [13] "31479360095907908092817694945"
## [14] "11474377948948020660089085281068730"
##
## [[2]]
## [1] "1" "2" "1" "5" "5" "1" "11" "16" "7" "1"
## [11] "23" "44" "30" "9" "1" "47" "112" "104" "48" "11"
## [21] "1" "95" "272" "320" "200" "70" "13" "1" "191" "640"
## [31] "912" "720" "340" "96" "15" "1" "383" "1472" "2464" "2352"
## [41] "1400" "532" "126" "17" "1" "767" "3328" "6400" "7168" "5152"
## [51] "2464" "784" "160" "19" "1" "1535" "7424"
##
## [[3]]
## [1] "1" "2" "4" "5" "8" "10" "16"
## [8] "20" "32" "40" "64" "80" "128" "160"
## [15] "256" "320" "512" "640" "1024" "1280" "2048"
## [22] "2560" "4096" "5120" "8192" "10240" "16384" "20480"
## [29] "32768" "40960" "65536" "81920" "131072" "163840" "262144"
## [36] "327680" "524288" "655360" "1048576" "1310720" "2097152"
##
## [[4]]
## [1] "1" "8" "25"
## [4] "83" "274" "2275"
## [7] "132224" "1060067" "3312425"
## [10] "10997342" "36304451" "301432950"
## [13] "17519415551" "140456757358" "438889687625"
## [16] "1457125820233" "4810267148324" "39939263006825"
## [19] "2321287521544174" "18610239435360217"
##
## [[5]]
## [1] "1" "111"
## [3] "12211" "1343211"
## [5] "147753211" "16252853211"
## [7] "1787813853211" "196659523853211"
## [9] "21632547623853211" "2379580238623853211"
## [11] "261753826248623853211" "28792920887348623853211"
##
## [[6]]
## [1] "1" "1" "1" "1" "1" "1" "1" "1" "1" "5" "1" "1" "1" "1"
## [15] "5" "5" "1" "1" "1" "1" "11" "5" "5" "11" "5" "1" "1" "1"
## [29] "1" "5" "23" "5" "23" "5" "5" "1" "1" "1" "1" "21" "5" "39"
## [43] "5" "5" "39" "5" "21" "5" "1" "1" "1" "1" "5" "1" "17" "1"
## [57] "17" "1" "1" "5" "1" "1" "1" "1" "31" "5" "5" "29" "1" "1"
## [71] "29" "1" "5"
##
## [[7]]
## [1] "840" "1320" "1680" "2520" "3192" "3432" "4920" "5208"
## [9] "5280" "5712" "6552" "6888" "9360" "11928" "16008" "19152"
## [17] "19992" "25200" "29568" "31080" "35880" "38280" "38640" "40920"
## [25] "41832" "45240" "47880" "48360" "48720" "51240" "51480" "53040"
## [33] "56280" "57288" "61320" "63240"
##
## [[8]]
## [1] "1" "2" "7"
## [4] "27" "113" "483"
## [7] "2138" "9681" "44374"
## [10] "205500" "961614" "4532177"
## [13] "21472917" "102258257" "489141279"
## [16] "2347573314" "11300344747" "54548339666"
## [19] "263926643851" "1279497594561" "6214413418672"
## [22] "30233348558479" "147297034473933" "718569377619361"
## [25] "3509725616656089" "17161306005034007" "83994842745043322"
##
## [[9]]
## [1] "4" "6" "8" "9" "26" "1752"
##
## [[10]]
## [1] "1" "2" "1" "3" "4" "2" "4" "8" "8" "3"
## [11] "5" "13" "19" "15" "5" "6" "19" "36" "42" "28"
## [21] "8" "7" "26" "60" "91" "89" "51" "13" "8" "34"
## [31] "92" "170" "216" "182" "92" "21" "9" "43" "133" "288"
## [41] "446" "489" "363" "164" "34" "10" "53" "184" "455" "826"
## [51] "1105" "1068" "709" "290" "55" "11" "64" "246" "682" "1414"
## [61] "2219" "2619"
# merge the train & test datasets into a single table ordered by Id
dat <- rbind(test, train)
dat <- dat[order(dat[["Id"]]), ]
head(dat[["Id"]], n = 20)
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
# memory footprint of the combined table
print(object.size(dat), units = "Mb")
## 47.5 Mb
# extract the actual numbers: one character vector of terms per sequence
nums <- strsplit(as.character(dat$Sequence), ",")

# Draw the first `side * side` entries of `sequences` as small line plots
# with a log-scaled y axis, arranged in a `side` x `side` grid.
#
# sequences: list of vectors holding the terms of each sequence (character
#            terms are converted with as.numeric()).
# side:      number of rows and of columns in the grid.
#
# Returns NULL invisibly; called for its plotting side effect.
plot_sequence_grid <- function(sequences, side) {
  op <- par(
    mfrow = c(side, side),
    oma = c(0, 0, 0, 0) + 0.1,
    mar = c(0, 0, 1, 1) + 0.1
  )
  # restore the previous graphics parameters even if a plot call fails
  on.exit(par(op), add = TRUE)

  # never index past the end when fewer sequences are available
  n_panels <- min(side * side, length(sequences))
  for (n in seq_len(n_panels)) {
    values <- as.numeric(sequences[[n]])
    # a log axis cannot show values <= 0: blank them explicitly so they
    # become gaps in the line instead of triggering a warning per panel
    values[!is.finite(values) | values <= 0] <- NA
    if (all(is.na(values))) {
      # nothing drawable; still consume the grid cell so that the
      # remaining panels keep their position
      plot.new()
    } else {
      # ann = FALSE: no room for axis titles in these tiny panels
      plot(values, log = "y", type = "l", axes = FALSE, ann = FALSE, col = n)
    }
  }
  invisible(NULL)
}

# first 100 sequences on a 10 x 10 grid, then the first 400 on a 20 x 20 grid
plot_sequence_grid(nums, 10)
plot_sequence_grid(nums, 20)
# lm
# start with one example: sequence 7, one row per term
# NOTE(review): the data frame is named `seq`, like base::seq(); the call on
# the next line still reaches the function because R skips non-function
# objects when resolving a call
seq <- data.frame(number = as.numeric(unlist(nums[[7]])))
seq$id <- seq(1, length(nums[[7]]))
# term value against its position, on a linear and then a log y axis
plot(seq$id, seq$number, type = "o")
plot(seq$id, seq$number, type = "o", log = "y")
# candidate models: linear, exponential and sinusoidal in the position
models.lm.formulas <- c(
  "number ~ id",
  "number ~ exp(id)",
  "number ~ sin(id)"
)
# fit every candidate and report its R squared
for (model_formula in models.lm.formulas) {
  model.lm <- lm(as.formula(model_formula), data = seq)
  r_squared <- summary(model.lm)$r.squared
  print(paste("R2", r_squared, "Model", model_formula))
}
## [1] "R2 0.278235903241236 Model number ~ id"
## [1] "R2 0.232376377523283 Model number ~ exp(id)"
## [1] "R2 0.00482291388548587 Model number ~ sin(id)"
# cheating: inner join on the sequence text, i.e. one row per (test, train)
# pair of rows holding exactly the same sequence
dim(merge(test, train, by = "Sequence"))
## [1] 130 3
# ...compared with the number of test rows
nrow(test)
## [1] 113845