Information Retrieval • quanteda.proximity

library(quanteda, quietly = TRUE)
#> Package version: 3.3.1
#> Unicode version: 14.0
#> ICU version: 70.1
#> Parallel computing: 4 of 4 threads used.
#> See https://quanteda.io for tutorials and examples.
library(quanteda.proximity)

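This is a simple example of using quanteda.proximity to solve an information retrieval problem. See this question on Stack Overflow. The puzzle is to extract the words that fall within a 15-word window of “fire”, together with their proximity to “fire”. Because the proximity is computed with `count_from = 0`, “fire” itself scores 0 and its immediate neighbours score 1, so keeping the rows with a proximity between 1 and 15 selects the window while leaving out the keyword itself.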

# Sample text for the example: one multi-line string literal, so the line
# breaks below are part of the string. The keyword "fire" occurs several times,
# including inside the hyphenated compound "dragon-fire".
txt1 <- "Far over the misty mountains cold To dungeons deep and caverns old We 
must away ere break of day To seek the pale enchanted gold. The dwarves of 
yore made mighty spells, While hammers fell like ringing bells In places deep, 
where dark things sleep, In hollow halls beneath the fells. For ancient king 
and elvish lord There many a gleaming golden hoard They shaped and wrought, 
and light they caught To hide in gems on hilt of sword. On silver necklaces 
they strung The flowering stars, on crowns they hung The dragon-fire, in 
twisted wire They meshed the light of moon and sun. Far over the misty 
mountains cold To dungeons deep and caverns old We must away, ere break of 
day, To claim our long-forgotten gold. Goblets they carved there for 
themselves And harps of gold; where no man delves There lay they long, and 
many a song Was sung unheard by men or elves. The pines were roaring on the 
height, The winds were moaning in the night. The fire was red, it flaming 
spread; The trees like torches blazed with light. The bells were ringing in 
the dale And men they looked up with faces pale; The dragon’s ire more fierce 
than fire Laid low their towers and houses frail. The mountain smoked beneath 
the moon; The dwarves they heard the tramp of doom. They fled their hall to 
dying fall Beneath his feet, beneath the moon. Far over the misty mountains 
grim To dungeons deep and caverns dim We must away, ere break of day,
To win our harps and gold from him!"

# Step 1: tokenise the text, dropping punctuation, numbers and symbols and
# splitting hyphenated words (so "dragon-fire" yields a separate "fire" token).
tok1 <- tokens(
  txt1,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE
)
# Step 2: lower-case every token so the keyword match below is on "fire".
tok1 <- tokens_tolower(tok1)
# Step 3: attach each token's proximity to the keyword "fire", counting
# from 0.
tok1 <- tokens_proximity(tok1, "fire", count_from = 0)

# Flatten the proximity-annotated tokens into a data frame; the printed result
# has the columns doc_id, token and proximity.
dat1 <- convert(tok1)
# Show the rows whose proximity lies in 1..15. Proximity 0 is left out on
# purpose, which drops the "fire" rows themselves.
with(dat1, dat1[proximity > 0 & proximity <= 15, ])
#>     doc_id token proximity
#> 79   text1    65        15
#> 80   text1    63        14
#> 81   text1    66        13
#> 82   text1    67        12
#> 83   text1    56        11
#> 84   text1    68        10
#> 85   text1     3         9
#> 86   text1    69         8
#> 87   text1    70         7
#> 88   text1    63         6
#> 89   text1    71         5
#> 90   text1    56         4
#> 91   text1    72         3
#> 92   text1     3         2
#> 93   text1    73         1
#> 95   text1    35         1
#> 96   text1    75         2
#> 97   text1    76         3
#> 98   text1    56         4
#> 99   text1    77         5
#> 100  text1     3         6
#> 101  text1    59         7
#> 102  text1    18         8
#> 103  text1    78         9
#> 104  text1    10        10
#> 105  text1    79        11
#> 106  text1     1        12
#> 107  text1     2        13
#> 108  text1     3        14
#> 109  text1     4        15
#> 160  text1     3        15
#> 161  text1   100        14
#> 162  text1   101        13
#> 163  text1   102        12
#> 164  text1    63        11
#> 165  text1     3        10
#> 166  text1   103         9
#> 167  text1     3         8
#> 168  text1   104         7
#> 169  text1   101         6
#> 170  text1   105         5
#> 171  text1    35         4
#> 172  text1     3         3
#> 173  text1   106         2
#> 174  text1     3         1
#> 176  text1    93         1
#> 177  text1   107         2
#> 178  text1   108         3
#> 179  text1   109         4
#> 180  text1   110         5
#> 181  text1     3         6
#> 182  text1   111         7
#> 183  text1    32         8
#> 184  text1   112         9
#> 185  text1   113        10
#> 186  text1   114        11
#> 187  text1    59        12
#> 188  text1     3        13
#> 189  text1    34        14
#> 190  text1   101        15
#> 194  text1   115        15
#> 195  text1    10        14
#> 196  text1    97        13
#> 197  text1    56        12
#> 198  text1   116        11
#> 199  text1   117        10
#> 200  text1   114         9
#> 201  text1   118         8
#> 202  text1    21         7
#> 203  text1     3         6
#> 204  text1   119         5
#> 205  text1   120         4
#> 206  text1   121         3
#> 207  text1   122         2
#> 208  text1   123         1
#> 210  text1   124         1
#> 211  text1   125         2
#> 212  text1   126         3
#> 213  text1   127         4
#> 214  text1    10         5
#> 215  text1   128         6
#> 216  text1   129         7
#> 217  text1     3         8
#> 218  text1   130         9
#> 219  text1   131        10
#> 220  text1    43        11
#> 221  text1     3        12
#> 222  text1    78        13
#> 223  text1     3        14
#> 224  text1    24        15