`data_corpus_inaugural` is a built-in dataset of `quanteda`. The code
below shows the usual way to conduct a dictionary-based analysis using
the Lexicoder dictionary (also built-in). It calculates the “Net Tone”
measure (Young & Soroka, 2012) for each document.
# Tokenise the corpus, lower-case it, and compound the dictionary's
# multi-word entries (joined with a space) before the lookup.
tok1 <- data_corpus_inaugural %>%
  tokens() %>%
  tokens_tolower() %>%
  tokens_compound(data_dictionary_LSD2015, concatenator = " ")

# Per-document counts for each dictionary category.
dat1 <- tok1 %>%
  dfm() %>%
  dfm_lookup(data_dictionary_LSD2015) %>%
  convert("data.frame")

# Per-document token totals. Reuse tok1 rather than re-running the whole
# tokenisation pipeline a second time; the result is the same.
token_count <- ntoken(tok1)

# Net Tone: share of positive-toned matches minus share of negative-toned
# matches, both relative to the document's total token count.
dat1$total_count <- token_count
dat1$nettone <- ((dat1$positive + dat1$neg_negative) / dat1$total_count) -
  ((dat1$negative + dat1$neg_positive) / dat1$total_count)

# Documents ranked from most positive to most negative Net Tone.
dat1[order(dat1$nettone, decreasing = TRUE), c("doc_id", "nettone")]
#> doc_id nettone
#> 16 1849-Taylor 0.092687075
#> 48 1977-Carter 0.081871345
#> 41 1949-Truman 0.080335731
#> 3 1797-Adams 0.072317263
#> 25 1885-Cleveland 0.067217631
#> 6 1809-Madison 0.066825776
#> 11 1829-Jackson 0.065397351
#> 47 1973-Nixon 0.061654135
#> 42 1953-Eisenhower 0.061269147
#> 17 1853-Pierce 0.060864776
#> 8 1817-Monroe 0.060185185
#> 28 1897-McKinley 0.058241252
#> 23 1877-Hayes 0.057280118
#> 55 2005-Bush 0.054592721
#> 4 1801-Jefferson 0.053617907
#> 51 1989-Bush 0.053558052
#> 21 1869-Grant 0.052888527
#> 40 1945-Roosevelt 0.052380952
#> 1 1789-Washington 0.051465798
#> 35 1925-Coolidge 0.050767148
#> 43 1957-Eisenhower 0.050393701
#> 54 2001-Bush 0.049944506
#> 36 1929-Hoover 0.047754996
#> 2 1793-Washington 0.047619048
#> 50 1985-Reagan 0.047504303
#> 58 2017-Trump 0.047016275
#> 10 1825-Adams 0.046452434
#> 34 1921-Harding 0.046079224
#> 12 1833-Jackson 0.045777427
#> 27 1893-Cleveland 0.044811321
#> 29 1901-McKinley 0.044800658
#> 26 1889-Harrison 0.043533659
#> 46 1969-Nixon 0.043117745
#> 18 1857-Buchanan 0.042829332
#> 15 1845-Polk 0.042068699
#> 57 2013-Obama 0.041918755
#> 33 1917-Wilson 0.041818182
#> 37 1933-Roosevelt 0.041402825
#> 9 1821-Monroe 0.040975210
#> 22 1873-Grant 0.040108770
#> 13 1837-VanBuren 0.040000000
#> 30 1905-Roosevelt 0.039888683
#> 39 1941-Roosevelt 0.036303630
#> 32 1913-Wilson 0.036247335
#> 49 1981-Reagan 0.035353535
#> 38 1937-Roosevelt 0.034743202
#> 24 1881-Garfield 0.033697348
#> 31 1909-Taft 0.032141629
#> 53 1997-Clinton 0.031622177
#> 14 1841-Harrison 0.029844196
#> 56 2009-Obama 0.027943368
#> 5 1805-Jefferson 0.027356902
#> 52 1993-Clinton 0.026186579
#> 59 2021-Biden 0.023559261
#> 44 1961-Kennedy 0.022742040
#> 45 1965-Johnson 0.020491803
#> 7 1813-Madison 0.015396459
#> 19 1861-Lincoln 0.013770656
#> 20 1865-Lincoln -0.007741935
The Net Tone measure captures the overall tone of the entire document. That might not be very interesting, because we don’t know exactly which words are associated with the so-called positive and negative tones.
Alternative I - Window
An alternative is to consider only the words that fall within a certain
window of some keywords you are interested in. This can be done purely
with `quanteda`. Suppose you are interested in all “ameri*” keywords and
a window of 20 tokens on either side of each match.
# Restrict the tokens (tok1, built earlier) to "ameri*" matches plus a
# window of 20 around each match, then count dictionary categories.
dat2 <- tok1 %>%
  tokens_select("ameri*", window = 20) %>%
  dfm() %>%
  dfm_lookup(data_dictionary_LSD2015) %>%
  convert("data.frame")

# The denominator is still the full-document token count computed earlier.
dat2$total_count <- token_count
dat2$nettone <- with(
  dat2,
  ((positive + neg_negative) / total_count) -
    ((negative + neg_positive) / total_count)
)

# Documents ranked from highest to lowest windowed Net Tone.
dat2[order(dat2[["nettone"]], decreasing = TRUE), c("doc_id", "nettone")]
#> doc_id nettone
#> 2 1793-Washington 0.0272108844
#> 50 1985-Reagan 0.0258175559
#> 47 1973-Nixon 0.0245614035
#> 58 2017-Trump 0.0241109102
#> 55 2005-Bush 0.0225303293
#> 54 2001-Bush 0.0216426193
#> 59 2021-Biden 0.0199347590
#> 52 1993-Clinton 0.0158210584
#> 53 1997-Clinton 0.0147843943
#> 57 2013-Obama 0.0133967156
#> 48 1977-Carter 0.0124269006
#> 51 1989-Bush 0.0112359551
#> 34 1921-Harding 0.0091619510
#> 28 1897-McKinley 0.0078268877
#> 49 1981-Reagan 0.0075757576
#> 43 1957-Eisenhower 0.0073490814
#> 41 1949-Truman 0.0071942446
#> 38 1937-Roosevelt 0.0070493454
#> 3 1797-Adams 0.0066096423
#> 16 1849-Taylor 0.0059523810
#> 36 1929-Hoover 0.0057098365
#> 35 1925-Coolidge 0.0056407942
#> 39 1941-Roosevelt 0.0052805281
#> 56 2009-Obama 0.0052160954
#> 27 1893-Cleveland 0.0051886792
#> 40 1945-Roosevelt 0.0047619048
#> 42 1953-Eisenhower 0.0047410649
#> 12 1833-Jackson 0.0047355959
#> 33 1917-Wilson 0.0042424242
#> 29 1901-McKinley 0.0041101521
#> 1 1789-Washington 0.0039087948
#> 46 1969-Nixon 0.0037313433
#> 18 1857-Buchanan 0.0035691110
#> 45 1965-Johnson 0.0029274005
#> 31 1909-Taft 0.0027500859
#> 26 1889-Harrison 0.0021235931
#> 37 1933-Roosevelt 0.0019483682
#> 13 1837-VanBuren 0.0019277108
#> 23 1877-Hayes 0.0014781966
#> 17 1853-Pierce 0.0013770311
#> 8 1817-Monroe 0.0013616558
#> 5 1805-Jefferson 0.0012626263
#> 19 1861-Lincoln 0.0012518778
#> 24 1881-Garfield 0.0012480499
#> 14 1841-Harrison 0.0005486065
#> 9 1821-Monroe 0.0004097521
#> 4 1801-Jefferson 0.0000000000
#> 6 1809-Madison 0.0000000000
#> 10 1825-Adams 0.0000000000
#> 11 1829-Jackson 0.0000000000
#> 15 1845-Polk 0.0000000000
#> 21 1869-Grant 0.0000000000
#> 22 1873-Grant 0.0000000000
#> 30 1905-Roosevelt 0.0000000000
#> 32 1913-Wilson 0.0000000000
#> 44 1961-Kennedy -0.0006497726
#> 7 1813-Madison -0.0007698229
#> 25 1885-Cleveland -0.0016528926
#> 20 1865-Lincoln -0.0051612903
Alternative II - Proximity
We can weight the frequency by the proximity to “ameri*” keywords.
# Attach proximity-to-"ameri*" information to the tokens (tok1, built
# earlier) before the dictionary lookup, so that the category counts are
# weighted by proximity as described in the accompanying text.
dat3 <- tok1 %>%
  tokens_proximity("ameri*") %>%
  dfm() %>%
  dfm_lookup(data_dictionary_LSD2015) %>%
  convert("data.frame")

# The denominator is still the full-document token count computed earlier.
dat3$total_count <- token_count
dat3$nettone <- with(
  dat3,
  ((positive + neg_negative) / total_count) -
    ((negative + neg_positive) / total_count)
)

# Documents ranked from highest to lowest proximity-weighted Net Tone.
dat3[order(dat3[["nettone"]], decreasing = TRUE), c("doc_id", "nettone")]
#> doc_id nettone
#> 58 2017-Trump 6.356202e-03
#> 2 1793-Washington 5.378879e-03
#> 55 2005-Bush 4.929468e-03
#> 50 1985-Reagan 4.743943e-03
#> 52 1993-Clinton 4.025636e-03
#> 54 2001-Bush 4.019771e-03
#> 47 1973-Nixon 3.663865e-03
#> 59 2021-Biden 2.880368e-03
#> 53 1997-Clinton 2.781912e-03
#> 57 2013-Obama 2.274827e-03
#> 48 1977-Carter 2.106950e-03
#> 27 1893-Cleveland 2.042739e-03
#> 34 1921-Harding 2.027470e-03
#> 51 1989-Bush 1.937617e-03
#> 43 1957-Eisenhower 1.570252e-03
#> 3 1797-Adams 1.554738e-03
#> 49 1981-Reagan 1.532393e-03
#> 28 1897-McKinley 1.454300e-03
#> 38 1937-Roosevelt 1.382690e-03
#> 41 1949-Truman 1.353336e-03
#> 40 1945-Roosevelt 1.229167e-03
#> 36 1929-Hoover 1.227361e-03
#> 46 1969-Nixon 1.180190e-03
#> 42 1953-Eisenhower 1.173807e-03
#> 33 1917-Wilson 1.164666e-03
#> 39 1941-Roosevelt 1.144202e-03
#> 29 1901-McKinley 1.136817e-03
#> 16 1849-Taylor 1.065152e-03
#> 56 2009-Obama 9.973793e-04
#> 35 1925-Coolidge 9.895372e-04
#> 1 1789-Washington 9.225379e-04
#> 18 1857-Buchanan 8.459756e-04
#> 12 1833-Jackson 8.217785e-04
#> 37 1933-Roosevelt 6.967383e-04
#> 25 1885-Cleveland 5.221836e-04
#> 26 1889-Harrison 4.981511e-04
#> 45 1965-Johnson 4.495909e-04
#> 31 1909-Taft 4.250471e-04
#> 24 1881-Garfield 4.064260e-04
#> 17 1853-Pierce 3.996896e-04
#> 8 1817-Monroe 3.485794e-04
#> 13 1837-VanBuren 3.231174e-04
#> 5 1805-Jefferson 2.959727e-04
#> 23 1877-Hayes 2.813624e-04
#> 9 1821-Monroe 2.012954e-04
#> 14 1841-Harrison 1.927036e-04
#> 19 1861-Lincoln 1.860261e-04
#> 11 1829-Jackson 5.409210e-05
#> 6 1809-Madison 5.312065e-05
#> 21 1869-Grant 4.299880e-05
#> 30 1905-Roosevelt 3.696820e-05
#> 4 1801-Jefferson 2.789693e-05
#> 22 1873-Grant 2.724781e-05
#> 32 1913-Wilson 1.931131e-05
#> 10 1825-Adams 1.477495e-05
#> 15 1845-Polk 8.116670e-06
#> 44 1961-Kennedy 4.907378e-06
#> 7 1813-Madison -4.665074e-05
#> 20 1865-Lincoln -1.122103e-03