Proximity-weighted text analysis • quanteda.proximity

suppressPackageStartupMessages(library(quanteda))
library(quanteda.proximity)

data_corpus_inaugural is a built-in dataset of quanteda. The code below shows the usual way to conduct a dictionary-based analysis with the Lexicoder dictionary (also built into quanteda): dictionary matches are counted in each speech, and the “Net Tone” measure (Young & Soroka, 2012) is calculated from those counts.

# Tokenise the inaugural speeches, lower-case them, and compound the
# multi-word entries of the Lexicoder dictionary (joined with a space).
tok1 <- data_corpus_inaugural %>%
  tokens() %>%
  tokens_tolower() %>%
  tokens_compound(data_dictionary_LSD2015, concatenator = " ")

# Count the matches of each Lexicoder category per document.
dat1 <- tok1 %>%
  dfm() %>%
  dfm_lookup(data_dictionary_LSD2015) %>%
  convert("data.frame")

# Per-document token counts. `tok1` is already the tokenised, lower-cased and
# compounded corpus, so it is reused here instead of rebuilding the pipeline.
token_count <- ntoken(tok1)

dat1$total_count <- token_count

# Net Tone: share of (positive + neg_negative) matches minus the share of
# (negative + neg_positive) matches, both relative to the document length.
dat1$nettone <- ((dat1$positive + dat1$neg_negative) / dat1$total_count) -
  ((dat1$negative + dat1$neg_positive) / dat1$total_count)
dat1[order(dat1$nettone, decreasing = TRUE), c("doc_id", "nettone")]
#>             doc_id      nettone
#> 16     1849-Taylor  0.092687075
#> 48     1977-Carter  0.081871345
#> 41     1949-Truman  0.080335731
#> 3       1797-Adams  0.072317263
#> 25  1885-Cleveland  0.067217631
#> 6     1809-Madison  0.066825776
#> 11    1829-Jackson  0.065397351
#> 47      1973-Nixon  0.061654135
#> 42 1953-Eisenhower  0.061269147
#> 17     1853-Pierce  0.060864776
#> 8      1817-Monroe  0.060185185
#> 28   1897-McKinley  0.058241252
#> 23      1877-Hayes  0.057280118
#> 55       2005-Bush  0.054592721
#> 4   1801-Jefferson  0.053617907
#> 51       1989-Bush  0.053558052
#> 21      1869-Grant  0.052888527
#> 40  1945-Roosevelt  0.052380952
#> 1  1789-Washington  0.051465798
#> 35   1925-Coolidge  0.050767148
#> 43 1957-Eisenhower  0.050393701
#> 54       2001-Bush  0.049944506
#> 36     1929-Hoover  0.047754996
#> 2  1793-Washington  0.047619048
#> 50     1985-Reagan  0.047504303
#> 58      2017-Trump  0.047016275
#> 10      1825-Adams  0.046452434
#> 34    1921-Harding  0.046079224
#> 12    1833-Jackson  0.045777427
#> 27  1893-Cleveland  0.044811321
#> 29   1901-McKinley  0.044800658
#> 26   1889-Harrison  0.043533659
#> 46      1969-Nixon  0.043117745
#> 18   1857-Buchanan  0.042829332
#> 15       1845-Polk  0.042068699
#> 57      2013-Obama  0.041918755
#> 33     1917-Wilson  0.041818182
#> 37  1933-Roosevelt  0.041402825
#> 9      1821-Monroe  0.040975210
#> 22      1873-Grant  0.040108770
#> 13   1837-VanBuren  0.040000000
#> 30  1905-Roosevelt  0.039888683
#> 39  1941-Roosevelt  0.036303630
#> 32     1913-Wilson  0.036247335
#> 49     1981-Reagan  0.035353535
#> 38  1937-Roosevelt  0.034743202
#> 24   1881-Garfield  0.033697348
#> 31       1909-Taft  0.032141629
#> 53    1997-Clinton  0.031622177
#> 14   1841-Harrison  0.029844196
#> 56      2009-Obama  0.027943368
#> 5   1805-Jefferson  0.027356902
#> 52    1993-Clinton  0.026186579
#> 59      2021-Biden  0.023559261
#> 44    1961-Kennedy  0.022742040
#> 45    1965-Johnson  0.020491803
#> 7     1813-Madison  0.015396459
#> 19    1861-Lincoln  0.013770656
#> 20    1865-Lincoln -0.007741935

The Net Tone measure captures the overall tone of the entire document. That might not be very interesting, because we don’t know exactly which words are associated with the so-called positive and negative tones.

Alternative I - Window

An alternative is to consider only the words that are within a certain window of some keywords you are interested in. This can be done purely with quanteda. Suppose you are interested in all “ameri*” keywords and a window of 20 tokens on either side of each match.

# Keep only the "ameri*" matches and the tokens inside a window of 20 around
# them, then count the Lexicoder categories in what is left.
dat2 <- tok1 %>%
  tokens_select("ameri*", window = 20) %>%
  dfm() %>%
  dfm_lookup(data_dictionary_LSD2015) %>%
  convert("data.frame")

# Normalise by the length of the whole document (token_count), not by the
# number of tokens that survived the window selection.
dat2$total_count <- token_count
dat2$nettone <- with(
  dat2,
  ((positive + neg_negative) / total_count) -
    ((negative + neg_positive) / total_count)
)
dat2[order(dat2[["nettone"]], decreasing = TRUE), c("doc_id", "nettone")]
#>             doc_id       nettone
#> 2  1793-Washington  0.0272108844
#> 50     1985-Reagan  0.0258175559
#> 47      1973-Nixon  0.0245614035
#> 58      2017-Trump  0.0241109102
#> 55       2005-Bush  0.0225303293
#> 54       2001-Bush  0.0216426193
#> 59      2021-Biden  0.0199347590
#> 52    1993-Clinton  0.0158210584
#> 53    1997-Clinton  0.0147843943
#> 57      2013-Obama  0.0133967156
#> 48     1977-Carter  0.0124269006
#> 51       1989-Bush  0.0112359551
#> 34    1921-Harding  0.0091619510
#> 28   1897-McKinley  0.0078268877
#> 49     1981-Reagan  0.0075757576
#> 43 1957-Eisenhower  0.0073490814
#> 41     1949-Truman  0.0071942446
#> 38  1937-Roosevelt  0.0070493454
#> 3       1797-Adams  0.0066096423
#> 16     1849-Taylor  0.0059523810
#> 36     1929-Hoover  0.0057098365
#> 35   1925-Coolidge  0.0056407942
#> 39  1941-Roosevelt  0.0052805281
#> 56      2009-Obama  0.0052160954
#> 27  1893-Cleveland  0.0051886792
#> 40  1945-Roosevelt  0.0047619048
#> 42 1953-Eisenhower  0.0047410649
#> 12    1833-Jackson  0.0047355959
#> 33     1917-Wilson  0.0042424242
#> 29   1901-McKinley  0.0041101521
#> 1  1789-Washington  0.0039087948
#> 46      1969-Nixon  0.0037313433
#> 18   1857-Buchanan  0.0035691110
#> 45    1965-Johnson  0.0029274005
#> 31       1909-Taft  0.0027500859
#> 26   1889-Harrison  0.0021235931
#> 37  1933-Roosevelt  0.0019483682
#> 13   1837-VanBuren  0.0019277108
#> 23      1877-Hayes  0.0014781966
#> 17     1853-Pierce  0.0013770311
#> 8      1817-Monroe  0.0013616558
#> 5   1805-Jefferson  0.0012626263
#> 19    1861-Lincoln  0.0012518778
#> 24   1881-Garfield  0.0012480499
#> 14   1841-Harrison  0.0005486065
#> 9      1821-Monroe  0.0004097521
#> 4   1801-Jefferson  0.0000000000
#> 6     1809-Madison  0.0000000000
#> 10      1825-Adams  0.0000000000
#> 11    1829-Jackson  0.0000000000
#> 15       1845-Polk  0.0000000000
#> 21      1869-Grant  0.0000000000
#> 22      1873-Grant  0.0000000000
#> 30  1905-Roosevelt  0.0000000000
#> 32     1913-Wilson  0.0000000000
#> 44    1961-Kennedy -0.0006497726
#> 7     1813-Madison -0.0007698229
#> 25  1885-Cleveland -0.0016528926
#> 20    1865-Lincoln -0.0051612903

Alternative II - Proximity

Instead of applying a hard cut-off window, we can weight the frequency of each dictionary match by its proximity to the “ameri*” keywords.

# Attach proximity information relative to the "ameri*" keywords to the
# tokens, then count the Lexicoder categories from the resulting dfm.
dat3 <- tok1 %>%
  tokens_proximity("ameri*") %>%
  dfm() %>%
  dfm_lookup(data_dictionary_LSD2015) %>%
  convert("data.frame")

# Same Net Tone formula as before, normalised by the full document length.
dat3$total_count <- token_count
dat3$nettone <- with(
  dat3,
  ((positive + neg_negative) / total_count) -
    ((negative + neg_positive) / total_count)
)
dat3[order(dat3[["nettone"]], decreasing = TRUE), c("doc_id", "nettone")]
#>             doc_id       nettone
#> 58      2017-Trump  6.356202e-03
#> 2  1793-Washington  5.378879e-03
#> 55       2005-Bush  4.929468e-03
#> 50     1985-Reagan  4.743943e-03
#> 52    1993-Clinton  4.025636e-03
#> 54       2001-Bush  4.019771e-03
#> 47      1973-Nixon  3.663865e-03
#> 59      2021-Biden  2.880368e-03
#> 53    1997-Clinton  2.781912e-03
#> 57      2013-Obama  2.274827e-03
#> 48     1977-Carter  2.106950e-03
#> 27  1893-Cleveland  2.042739e-03
#> 34    1921-Harding  2.027470e-03
#> 51       1989-Bush  1.937617e-03
#> 43 1957-Eisenhower  1.570252e-03
#> 3       1797-Adams  1.554738e-03
#> 49     1981-Reagan  1.532393e-03
#> 28   1897-McKinley  1.454300e-03
#> 38  1937-Roosevelt  1.382690e-03
#> 41     1949-Truman  1.353336e-03
#> 40  1945-Roosevelt  1.229167e-03
#> 36     1929-Hoover  1.227361e-03
#> 46      1969-Nixon  1.180190e-03
#> 42 1953-Eisenhower  1.173807e-03
#> 33     1917-Wilson  1.164666e-03
#> 39  1941-Roosevelt  1.144202e-03
#> 29   1901-McKinley  1.136817e-03
#> 16     1849-Taylor  1.065152e-03
#> 56      2009-Obama  9.973793e-04
#> 35   1925-Coolidge  9.895372e-04
#> 1  1789-Washington  9.225379e-04
#> 18   1857-Buchanan  8.459756e-04
#> 12    1833-Jackson  8.217785e-04
#> 37  1933-Roosevelt  6.967383e-04
#> 25  1885-Cleveland  5.221836e-04
#> 26   1889-Harrison  4.981511e-04
#> 45    1965-Johnson  4.495909e-04
#> 31       1909-Taft  4.250471e-04
#> 24   1881-Garfield  4.064260e-04
#> 17     1853-Pierce  3.996896e-04
#> 8      1817-Monroe  3.485794e-04
#> 13   1837-VanBuren  3.231174e-04
#> 5   1805-Jefferson  2.959727e-04
#> 23      1877-Hayes  2.813624e-04
#> 9      1821-Monroe  2.012954e-04
#> 14   1841-Harrison  1.927036e-04
#> 19    1861-Lincoln  1.860261e-04
#> 11    1829-Jackson  5.409210e-05
#> 6     1809-Madison  5.312065e-05
#> 21      1869-Grant  4.299880e-05
#> 30  1905-Roosevelt  3.696820e-05
#> 4   1801-Jefferson  2.789693e-05
#> 22      1873-Grant  2.724781e-05
#> 32     1913-Wilson  1.931131e-05
#> 10      1825-Adams  1.477495e-05
#> 15       1845-Polk  8.116670e-06
#> 44    1961-Kennedy  4.907378e-06
#> 7     1813-Madison -4.665074e-05
#> 20    1865-Lincoln -1.122103e-03