library(quanteda, quietly = TRUE)
#> Package version: 3.3.1
#> Unicode version: 14.0
#> ICU version: 70.1
#> Parallel computing: 4 of 4 threads used.
#> See https://quanteda.io for tutorials and examples.
library(quanteda.proximity)
This is a simple example of using quanteda.proximity to solve an
information-retrieval problem (see this question on Stack Overflow). The
puzzle is to extract the words that fall within a 15-word window of “fire”,
together with their proximity to “fire”.
# Sample text to search, stored as one multi-line character string (the line
# breaks inside the quotes are part of the value). The word "fire" occurs three
# times in it, once as the second half of the hyphenated "dragon-fire".
txt1 <- "Far over the misty mountains cold To dungeons deep and caverns old We
must away ere break of day To seek the pale enchanted gold. The dwarves of
yore made mighty spells, While hammers fell like ringing bells In places deep,
where dark things sleep, In hollow halls beneath the fells. For ancient king
and elvish lord There many a gleaming golden hoard They shaped and wrought,
and light they caught To hide in gems on hilt of sword. On silver necklaces
they strung The flowering stars, on crowns they hung The dragon-fire, in
twisted wire They meshed the light of moon and sun. Far over the misty
mountains cold To dungeons deep and caverns old We must away, ere break of
day, To claim our long-forgotten gold. Goblets they carved there for
themselves And harps of gold; where no man delves There lay they long, and
many a song Was sung unheard by men or elves. The pines were roaring on the
height, The winds were moaning in the night. The fire was red, it flaming
spread; The trees like torches blazed with light. The bells were ringing in
the dale And men they looked up with faces pale; The dragon’s ire more fierce
than fire Laid low their towers and houses frail. The mountain smoked beneath
the moon; The dwarves they heard the tramp of doom. They fled their hall to
dying fall Beneath his feet, beneath the moon. Far over the misty mountains
grim To dungeons deep and caverns dim We must away, ere break of day,
To win our harps and gold from him!"
# Tokenise the text with punctuation, numbers and symbols removed and
# hyphenated words split (presumably so that "dragon-fire" contributes its own
# "fire" token), lower-case the tokens, then attach each token's proximity to
# the pattern "fire", counting from 0.
tok1 <- tokens(
  txt1,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE
) %>%
  tokens_tolower() %>%
  tokens_proximity("fire", count_from = 0)

# Convert the annotated tokens to a data frame; the printed result has the
# columns doc_id, token and proximity.
# NOTE(review): the `token` column prints as integers rather than words --
# presumably internal token-type IDs; confirm against convert()'s docs.
dat1 <- convert(tok1)

# Keep only the rows whose proximity lies in 1..15. The lower bound drops rows
# with proximity 0, which (given count_from = 0) looks like the "fire" tokens
# themselves -- TODO confirm.
dat1[with(dat1, proximity > 0 & proximity <= 15), ]
#> doc_id token proximity
#> 79 text1 65 15
#> 80 text1 63 14
#> 81 text1 66 13
#> 82 text1 67 12
#> 83 text1 56 11
#> 84 text1 68 10
#> 85 text1 3 9
#> 86 text1 69 8
#> 87 text1 70 7
#> 88 text1 63 6
#> 89 text1 71 5
#> 90 text1 56 4
#> 91 text1 72 3
#> 92 text1 3 2
#> 93 text1 73 1
#> 95 text1 35 1
#> 96 text1 75 2
#> 97 text1 76 3
#> 98 text1 56 4
#> 99 text1 77 5
#> 100 text1 3 6
#> 101 text1 59 7
#> 102 text1 18 8
#> 103 text1 78 9
#> 104 text1 10 10
#> 105 text1 79 11
#> 106 text1 1 12
#> 107 text1 2 13
#> 108 text1 3 14
#> 109 text1 4 15
#> 160 text1 3 15
#> 161 text1 100 14
#> 162 text1 101 13
#> 163 text1 102 12
#> 164 text1 63 11
#> 165 text1 3 10
#> 166 text1 103 9
#> 167 text1 3 8
#> 168 text1 104 7
#> 169 text1 101 6
#> 170 text1 105 5
#> 171 text1 35 4
#> 172 text1 3 3
#> 173 text1 106 2
#> 174 text1 3 1
#> 176 text1 93 1
#> 177 text1 107 2
#> 178 text1 108 3
#> 179 text1 109 4
#> 180 text1 110 5
#> 181 text1 3 6
#> 182 text1 111 7
#> 183 text1 32 8
#> 184 text1 112 9
#> 185 text1 113 10
#> 186 text1 114 11
#> 187 text1 59 12
#> 188 text1 3 13
#> 189 text1 34 14
#> 190 text1 101 15
#> 194 text1 115 15
#> 195 text1 10 14
#> 196 text1 97 13
#> 197 text1 56 12
#> 198 text1 116 11
#> 199 text1 117 10
#> 200 text1 114 9
#> 201 text1 118 8
#> 202 text1 21 7
#> 203 text1 3 6
#> 204 text1 119 5
#> 205 text1 120 4
#> 206 text1 121 3
#> 207 text1 122 2
#> 208 text1 123 1
#> 210 text1 124 1
#> 211 text1 125 2
#> 212 text1 126 3
#> 213 text1 127 4
#> 214 text1 10 5
#> 215 text1 128 6
#> 216 text1 129 7
#> 217 text1 3 8
#> 218 text1 130 9
#> 219 text1 131 10
#> 220 text1 43 11
#> 221 text1 3 12
#> 222 text1 78 13
#> 223 text1 3 14
#> 224 text1 24 15