Basic explorer

A basic data exploration script for the “Integer Sequence Learning” Kaggle challenge. The competition asks participants to build a machine learning algorithm capable of guessing the next number in an integer sequence.

https://www.kaggle.com/c/integer-sequence-learning

# init
# Run a garbage collection and report memory use. The former rm(list = ls())
# was removed: when the script is sourced into an interactive session it
# silently deletes every object in the caller's workspace, yet it neither
# detaches packages nor resets options, so it never produced a truly clean
# session anyway. Start from a fresh R process instead.
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 311892  8.4     592000 15.9   350000  9.4
## Vcells 340274  2.6     786432  6.0   677342  5.2
# libs
library(stringr)
#library(ggplot2)

data reading

# Read both competition files straight out of their zip archives.
train <- read.csv(
  file = unz(description = "train.csv.zip", filename = "train.csv")
)
test <- read.csv(
  file = unz(description = "test.csv.zip", filename = "test.csv")
)
# Alternative paths for reading the already unzipped CSV files:
#train = read.csv("../input/train.csv")
#test = read.csv("../input/test.csv")


# Compact structure overview of both data frames (test first, then train).
str(test)
str(train)
## 'data.frame':    113845 obs. of  2 variables:
##  $ Id      : int  1 2 4 5 6 9 10 12 14 17 ...
##  $ Sequence: Factor w/ 112724 levels "-1,-1,-1,-1,-1,-1,-1,-1,0,-1,-1,-1,-1,1,-1,-1,-1,-1,0,1,-1,-1,-1,-1,-1,1,0,-1,-1,-1,-1,-1,0,2,-1,-1,-1,-1,-1,-1,-1,1,2,-1,-1,-1"| __truncated__,..: 21170 37154 8655 61774 33817 16343 33595 2584 99129 22897 ...
## 'data.frame':    113845 obs. of  2 variables:
##  $ Id      : int  3 7 8 11 13 15 16 18 20 21 ...
##  $ Sequence: Factor w/ 112880 levels "-1,-1,-1,-1,-1,-1,-1,-1,8,-1,-10,-19,-28,-37,-46,-55,-64,-73,17,8,-1,-10,-19,-28,-37,-46,-55,-64,26,17,8,-1,-10,-19,-28,-37,-46"| __truncated__,..: 56405 42851 49586 69391 39594 25563 111167 53155 99989 42592 ...
# Row and column counts (test first, then train).
dim(test)
dim(train)
## [1] 113845      2
## [1] 113845      2
# how are the Ids from test/train distributed
# (first six Ids of each set, train first)
head(train[["Id"]])
head(test[["Id"]])
## [1]  3  7  8 11 13 15
## [1] 1 2 4 5 6 9
# is there an overlap?
# all = TRUE requests a full outer join on Id, so Ids that occur in only one
# of the two sets are kept as rows of their own; the result is stored in `dat`
# and its dimensions are printed.
dat <- merge(x = test, y = train, by = "Id", all = TRUE)
dim(dat)
## [1] 227690      3
# explore the sequences
# (first raw sequence of the training set)
train[["Sequence"]][1]
## [1] 1,3,13,87,1053,28576,2141733,508147108,402135275365,1073376057490373,9700385489355970183,298434346895322960005291,31479360095907908092817694945,11474377948948020660089085281068730
## 112880 Levels: -1,-1,-1,-1,-1,-1,-1,-1,8,-1,-10,-19,-28,-37,-46,-55,-64,-73,17,8,-1,-10,-19,-28,-37,-46,-55,-64,26,17,8,-1,-10,-19,-28,-37,-46,-55,35,26,17,8,-1,-10,-19,-28,-37,-46,44,35,26,17,8,-1,-10,-19,-28,-37,53,44,35 ...
# first raw sequence of the test set
test[["Sequence"]][1]
## [1] 1,0,0,2,24,552,21280,103760,70299264,5792853248,587159944704
## 112724 Levels: -1,-1,-1,-1,-1,-1,-1,-1,0,-1,-1,-1,-1,1,-1,-1,-1,-1,0,1,-1,-1,-1,-1,-1,1,0,-1,-1,-1,-1,-1,0,2,-1,-1,-1,-1,-1,-1,-1,1,2,-1,-1,-1,-1,-1,-1,-1,0,2,1,0,-1,-1,-1,-1,-1,-1,-1,1,3,-1,1,-1,-1,-1,-1,-1,-1,-1,0,2,3,-3,1,-1,-1,-1,-1,-1,-1,-1,-1,1,3,2,-4,0,-1,-1 ...

Counting the numbers in both sets

Is there really a number missing in the test set?

# count all numbers in both sets
# (each sequence is one comma-separated string, so numbers = commas + 1)
train_numbers_count <- 1 + str_count(string = train$Sequence, pattern = ",")
test_numbers_count <- 1 + str_count(string = test$Sequence, pattern = ",")

# is there really one number missing in the test set?
summary(train_numbers_count)
summary(test_numbers_count)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   19.00   34.00   41.67   59.00  348.00
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   18.00   33.00   40.54   57.00  347.00
# ...so it appears

# interesting spikes at 50 / 100 in training set (test: 49/99)
# Histograms of the per-sequence number counts, drawn with a 50-colour
# rainbow palette.
hist(x = train_numbers_count, breaks = 100, col = rainbow(n = 50))

hist(x = test_numbers_count, breaks = 100, col = rainbow(n = 50))

# so some series are really long

What numbers are involved

# what numbers are actually involved
# Split the first ten training sequences at the commas; the result is a list
# with one character vector per sequence.
strsplit(x = as.character(train[["Sequence"]][seq_len(10)]), split = ",")
## [[1]]
##  [1] "1"                                  
##  [2] "3"                                  
##  [3] "13"                                 
##  [4] "87"                                 
##  [5] "1053"                               
##  [6] "28576"                              
##  [7] "2141733"                            
##  [8] "508147108"                          
##  [9] "402135275365"                       
## [10] "1073376057490373"                   
## [11] "9700385489355970183"                
## [12] "298434346895322960005291"           
## [13] "31479360095907908092817694945"      
## [14] "11474377948948020660089085281068730"
## 
## [[2]]
##  [1] "1"    "2"    "1"    "5"    "5"    "1"    "11"   "16"   "7"    "1"   
## [11] "23"   "44"   "30"   "9"    "1"    "47"   "112"  "104"  "48"   "11"  
## [21] "1"    "95"   "272"  "320"  "200"  "70"   "13"   "1"    "191"  "640" 
## [31] "912"  "720"  "340"  "96"   "15"   "1"    "383"  "1472" "2464" "2352"
## [41] "1400" "532"  "126"  "17"   "1"    "767"  "3328" "6400" "7168" "5152"
## [51] "2464" "784"  "160"  "19"   "1"    "1535" "7424"
## 
## [[3]]
##  [1] "1"       "2"       "4"       "5"       "8"       "10"      "16"     
##  [8] "20"      "32"      "40"      "64"      "80"      "128"     "160"    
## [15] "256"     "320"     "512"     "640"     "1024"    "1280"    "2048"   
## [22] "2560"    "4096"    "5120"    "8192"    "10240"   "16384"   "20480"  
## [29] "32768"   "40960"   "65536"   "81920"   "131072"  "163840"  "262144" 
## [36] "327680"  "524288"  "655360"  "1048576" "1310720" "2097152"
## 
## [[4]]
##  [1] "1"                 "8"                 "25"               
##  [4] "83"                "274"               "2275"             
##  [7] "132224"            "1060067"           "3312425"          
## [10] "10997342"          "36304451"          "301432950"        
## [13] "17519415551"       "140456757358"      "438889687625"     
## [16] "1457125820233"     "4810267148324"     "39939263006825"   
## [19] "2321287521544174"  "18610239435360217"
## 
## [[5]]
##  [1] "1"                       "111"                    
##  [3] "12211"                   "1343211"                
##  [5] "147753211"               "16252853211"            
##  [7] "1787813853211"           "196659523853211"        
##  [9] "21632547623853211"       "2379580238623853211"    
## [11] "261753826248623853211"   "28792920887348623853211"
## 
## [[6]]
##  [1] "1"  "1"  "1"  "1"  "1"  "1"  "1"  "1"  "1"  "5"  "1"  "1"  "1"  "1" 
## [15] "5"  "5"  "1"  "1"  "1"  "1"  "11" "5"  "5"  "11" "5"  "1"  "1"  "1" 
## [29] "1"  "5"  "23" "5"  "23" "5"  "5"  "1"  "1"  "1"  "1"  "21" "5"  "39"
## [43] "5"  "5"  "39" "5"  "21" "5"  "1"  "1"  "1"  "1"  "5"  "1"  "17" "1" 
## [57] "17" "1"  "1"  "5"  "1"  "1"  "1"  "1"  "31" "5"  "5"  "29" "1"  "1" 
## [71] "29" "1"  "5" 
## 
## [[7]]
##  [1] "840"   "1320"  "1680"  "2520"  "3192"  "3432"  "4920"  "5208" 
##  [9] "5280"  "5712"  "6552"  "6888"  "9360"  "11928" "16008" "19152"
## [17] "19992" "25200" "29568" "31080" "35880" "38280" "38640" "40920"
## [25] "41832" "45240" "47880" "48360" "48720" "51240" "51480" "53040"
## [33] "56280" "57288" "61320" "63240"
## 
## [[8]]
##  [1] "1"                 "2"                 "7"                
##  [4] "27"                "113"               "483"              
##  [7] "2138"              "9681"              "44374"            
## [10] "205500"            "961614"            "4532177"          
## [13] "21472917"          "102258257"         "489141279"        
## [16] "2347573314"        "11300344747"       "54548339666"      
## [19] "263926643851"      "1279497594561"     "6214413418672"    
## [22] "30233348558479"    "147297034473933"   "718569377619361"  
## [25] "3509725616656089"  "17161306005034007" "83994842745043322"
## 
## [[9]]
## [1] "4"    "6"    "8"    "9"    "26"   "1752"
## 
## [[10]]
##  [1] "1"    "2"    "1"    "3"    "4"    "2"    "4"    "8"    "8"    "3"   
## [11] "5"    "13"   "19"   "15"   "5"    "6"    "19"   "36"   "42"   "28"  
## [21] "8"    "7"    "26"   "60"   "91"   "89"   "51"   "13"   "8"    "34"  
## [31] "92"   "170"  "216"  "182"  "92"   "21"   "9"    "43"   "133"  "288" 
## [41] "446"  "489"  "363"  "164"  "34"   "10"   "53"   "184"  "455"  "826" 
## [51] "1105" "1068" "709"  "290"  "55"   "11"   "64"   "246"  "682"  "1414"
## [61] "2219" "2619"
# merge the train & test datasets
# Stack both sets (test rows first), then sort the combined frame by Id and
# show the first 20 Ids.
dat <- rbind(test, train)
dat <- dat[order(dat[["Id"]]), ]
head(dat[["Id"]], n = 20)
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
# in-memory size of the combined data frame, reported in Mb
print(x = object.size(dat), units = "Mb")
## 47.5 Mb
# extract the actual numbers
# `nums` becomes a list with one character vector per row of `dat`; the
# elements are still strings, not numerics.
nums <- strsplit(x = as.character(dat[["Sequence"]]), split = ",")

Plot the first 100 sequences (train and test combined, ordered by Id), log scale

# Draw the first 100 sequences as a 10 x 10 grid of small line plots with a
# logarithmic y axis. The previous par() settings are saved in `op` and
# restored once the grid is complete.
op <- par(mfrow = c(10, 10),
          oma = c(0, 0, 0, 0) + 0.1,
          mar = c(0, 0, 1, 1) + 0.1)
# seq_len() + min() keeps the loop in bounds if fewer sequences are available.
for (n in seq_len(min(100, length(nums)))) {
  # The elements of `nums` are character vectors (they come from strsplit()),
  # so convert explicitly instead of relying on plot()'s implicit coercion.
  # Values <= 0 cannot be drawn on a log axis; plot() omits them with a
  # warning.
  plot(as.numeric(nums[[n]]), log = "y", type = "l", axes = FALSE, col = n)
}

par(op)

Plot the first 400 sequences (train and test combined, ordered by Id), log scale

# Draw the first 400 sequences as a 20 x 20 grid of small line plots with a
# logarithmic y axis. The previous par() settings are saved in `op` and
# restored once the grid is complete.
op <- par(mfrow = c(20, 20),
          oma = c(0, 0, 0, 0) + 0.1,
          mar = c(0, 0, 1, 1) + 0.1)
# seq_len() + min() keeps the loop in bounds if fewer sequences are available.
for (n in seq_len(min(400, length(nums)))) {
  # The elements of `nums` are character vectors (they come from strsplit()),
  # so convert explicitly instead of relying on plot()'s implicit coercion.
  # Values <= 0 cannot be drawn on a log axis; plot() omits them with a
  # warning.
  plot(as.numeric(nums[[n]]), log = "y", type = "l", axes = FALSE, col = n)
}

par(op)

let the fun begin

#lm

# start with one example
# `number` holds the values of the 7th sequence in `nums`, `id` their 1-based
# positions. seq_along() replaces seq(1, length(...)), which yields c(1, 0)
# for an empty sequence and had to be called while the name `seq` was already
# rebound to this data frame.
# NOTE(review): the data frame still shadows base::seq(); the name is kept
# because code outside this section may refer to it.
seq <- data.frame(number = as.numeric(nums[[7]]))
seq$id <- seq_along(nums[[7]])

# Raw values against position, on a linear and on a logarithmic y axis.
plot(seq$id, seq$number, type = "o")

plot(seq$id, seq$number, type = "o", log = "y")

# Candidate model formulas, kept as strings so they can be printed verbatim
# next to their fit quality.
models.lm.formulas <- c(
  "number ~ id",
  "number ~ exp(id)",
  "number ~ sin(id)"
)

# Fit every candidate with lm() and print its R squared.
for (formula in models.lm.formulas) {
  model.lm <- lm(as.formula(formula), data = seq)
  r_squared <- summary(model.lm)$r.squared
  print(paste("R2", r_squared, "Model", formula))
}
## [1] "R2 0.278235903241236 Model number ~ id"
## [1] "R2 0.232376377523283 Model number ~ exp(id)"
## [1] "R2 0.00482291388548587 Model number ~ sin(id)"

Cheating

# cheating
# Inner join on the raw sequence string: each result row pairs a test
# sequence with a training sequence that has exactly the same text.
dim(merge(x = test, y = train, by = "Sequence"))
## [1] 130   3
# total number of test sequences, for comparison with the match count
dim(test)[1L]
## [1] 113845