Topic modelling med gensim og dhlab

2.2. Topic modelling med gensim og dhlab

import dhlab as dh
import pandas as pd
import gensim
import pyLDAvis
from pprint import pprint
import pyLDAvis.gensim_models as genvis
# sprit urn URN:NBN:no-nb_digibok_2020090207537

Finn en bok

# Look up a single digitised book (doctype "digibok") matching the title query
# "Norgeshistorie"; limit=1 keeps only one hit.
bok = dh.Corpus(doctype="digibok", limit=1, title="Norgeshistorie")
# Bare expression so the notebook renders the corpus as a table.
bok
dhlabid urn title authors oaiid sesamid isbn10 city timestamp year publisher langs subjects ddc genres literaryform doctype ocr_creator ocr_timestamp
0 100284219 URN:NBN:no-nb_digibok_2016021608256 Norgeshistorie etter 1850 , VK II : lærebok Olstad , Finn oai:nb.bibsys.no:999408838804702202 784f7a0ffde006a7f606ed4f65b29202 8200408442 19940101 1994 Universitetsforl. nob Historie / historie / norge Faglitteratur digibok nb 20060101
# Take the URN of the first row of the corpus frame (with limit=1 there is at
# most one). Raises IndexError if the search returned no rows.
urn = bok.frame.urn.tolist()[0]
urn
'URN:NBN:no-nb_digibok_2016021608256'

2.2.1. Chunking

# Fetch the book as a list of chunks; each chunk is consumed further down as a
# token -> count mapping.
# NOTE(review): chunks=1000 presumably sets the chunk size in tokens — confirm
# against the dhlab documentation.
res = dh.Chunks(chunks=1000, urn=urn)
# Number of chunks returned for this book.
len(res.chunks)
116
def chunks_to_corpus(chunks_list):
    """Turn token-count chunks into whitespace-separated text documents.

    Each element of ``chunks_list`` is a mapping from token to an integer
    count. Every token is written out ``count`` times, each occurrence
    followed by a single space, in the mapping's iteration order.

    Returns a list with one string per chunk; an empty chunk yields ``""``.
    """
    return [
        "".join((token + " ") * chunk[token] for token in chunk)
        for chunk in chunks_list
    ]
# One whitespace-separated pseudo-document per chunk.
# NOTE(review): `texts` is not referenced again in the visible code.
texts = chunks_to_corpus(res.chunks)

2.2.2. Find delta TFIDF

# Token-by-chunk count matrix: after the transpose there is one row per token
# and one column per chunk; tokens absent from a chunk get 0 instead of NaN.
df = pd.DataFrame(res.chunks).transpose().fillna(0)
df
0 1 2 3 4 5 6 7 8 9 ... 106 107 108 109 110 111 112 113 114 115
$ 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
( 3.0 2.0 59.0 2.0 3.0 0.0 9.0 3.0 3.0 0.0 ... 7.0 3.0 6.0 1.0 2.0 4.0 3.0 4.0 1.0 5.0
) 2.0 2.0 58.0 2.0 3.0 0.0 10.0 3.0 3.0 0.0 ... 7.0 3.0 4.0 1.0 2.0 3.0 1.0 4.0 2.0 5.0
, 44.0 25.0 41.0 30.0 29.0 12.0 75.0 27.0 29.0 34.0 ... 22.0 32.0 27.0 34.0 24.0 42.0 34.0 38.0 45.0 46.0
- 2.0 4.0 52.0 9.0 3.0 3.0 14.0 12.0 26.0 10.0 ... 12.0 13.0 15.0 6.0 11.0 6.0 15.0 13.0 6.0 8.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
vekkelsesbevegelsen 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
venstrepolitiker 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
vestnorske 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
årsakssammenhenger 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
ætten 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0

17083 rows × 116 columns

# Import top 50000 tokens from NB collection
tot = dh.totals(50000)
# Divide each token's total count in the book by its reference frequency.
# pandas aligns the two operands on their index, so a token missing from
# either side becomes NaN (assumes `tot` is indexed by token — TODO confirm).
# NOTE(review): this rebinds `res`, which held the Chunks object above;
# re-running earlier cells that read `res.chunks` will fail after this line.
res = df.sum(axis=1) /  tot.freq
# Keep up to 1000 tokens with the highest book/reference ratio, i.e. the most
# over-represented tokens rather than the most frequent ones. sort_values puts
# NaN last by default, so dropna() can only trim the tail of the selection.
target_tokens = res.sort_values(ascending=False).iloc[:1000].dropna().index
# Show the per-chunk counts for the selected tokens.
df.loc[target_tokens]
0 1 2 3 4 5 6 7 8 9 ... 106 107 108 109 110 111 112 113 114 115
bidrog 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 2.0 0.0 1.0 0.0 1.0 0.0 0.0
Eyde 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
regjeringsmakten 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
samfunnsutviklingen 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Nygaardsvold 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
harme 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
statlige 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
oppløst 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
spasere 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Offentlig 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

1000 rows × 116 columns

# Rebuild one pseudo-document per chunk using only the selected tokens: each
# token is repeated as many times as it occurs in that chunk, every occurrence
# followed by a single space.
# Slice the target rows once instead of re-slicing the frame for every column.
target_counts = df.loc[target_tokens]

outer_lst = [
    "".join(
        (token + " ") * int(count)  # counts are floats in df, hence int()
        for token, count in target_counts[col].items()
        if count > 0
    )
    for col in target_counts.columns
]
len(outer_lst)
116

2.2.3. Prep for LDA

# Tokenise every pseudo-document on whitespace.
data = [document.split() for document in outer_lst]
# Build the token <-> integer id mapping from the tokenised documents.
id2word = gensim.corpora.Dictionary(documents=data)
# Convert each tokenised document to its bag-of-words representation.
corpus = [id2word.doc2bow(tokens) for tokens in data]

2.2.4. Make model

# Train a 10-topic LDA model on the bag-of-words corpus with gensim's
# multi-process implementation.
# NOTE(review): no random_state is passed, so the topics may differ from run
# to run — confirm whether reproducibility matters here.
lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=10)
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=10427) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=10427) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=10427) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=10427) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=10427) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=10427) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=10427) is multi-threaded, use of fork() may lead to deadlocks in the child.
  self.pid = os.fork()
# Pretty-print the topics as (topic id, weighted top-word string) pairs.
pprint(lda_model.print_topics())
[(0,
  '0.037*"årene" + 0.014*"Arbeiderpartiet" + 0.013*"Stortinget" + '
  '0.009*"sosiale" + 0.009*"samfunnet" + 0.008*"krigen" + 0.008*"utviklingen" '
  '+ 0.008*"arbeidere" + 0.008*"Kilde" + 0.008*"preget"'),
 (1,
  '0.030*"årene" + 0.016*"samiske" + 0.013*"1960" + 0.012*"samene" + '
  '0.011*"Eyde" + 0.010*"1970" + 0.010*"Hvorfor" + 0.010*"Rjukan" + '
  '0.009*"kvinnene" + 0.008*"gav"'),
 (2,
  '0.023*"årene" + 0.017*"Arbeiderpartiet" + 0.012*"regjeringen" + 0.012*"gav" '
  '+ 0.011*"Gerhardsen" + 0.010*"unionen" + 0.009*"kvinnene" + 0.008*"122" + '
  '0.008*"181" + 0.008*"127"'),
 (3,
  '0.039*"årene" + 0.016*"krigen" + 0.015*"regjeringen" + 0.015*"Stortinget" + '
  '0.013*"gav" + 0.012*"tyskerne" + 0.010*"1970" + 0.010*"Hvordan" + '
  '0.010*"1800" + 0.009*"bidrog"'),
 (4,
  '0.035*"årene" + 0.017*"arbeiderne" + 0.011*"gav" + 0.010*"Hvordan" + '
  '0.010*"1970" + 0.008*"Arbeiderpartiet" + 0.008*"arbeidere" + '
  '0.007*"samfunnet" + 0.007*"1960" + 0.007*"sosiale"'),
 (5,
  '0.024*"årene" + 0.019*"Venstre" + 0.013*"regjeringen" + 0.012*"Hvordan" + '
  '0.012*"abort" + 0.011*"næringslivet" + 0.010*"EF" + 0.010*"samfunnet" + '
  '0.010*"Hvorfor" + 0.009*"Stortinget"'),
 (6,
  '0.036*"NS" + 0.031*"krigen" + 0.023*"tyskerne" + 0.014*"årene" + '
  '0.012*"Hvordan" + 0.011*"okkupasjonen" + 0.011*"Nasjonal" + '
  '0.011*"motstand" + 0.010*"Arbeiderpartiet" + 0.009*"utviklingen"'),
 (7,
  '0.035*"0,1" + 0.016*"regjeringen" + 0.014*"tyskerne" + 0.014*"årene" + '
  '0.013*"NS" + 0.012*"Kilde" + 0.011*"0,2" + 0.010*"0,3" + 0.010*"motstand" + '
  '0.010*"motstanden"'),
 (8,
  '0.019*"krigen" + 0.017*"NS" + 0.015*"Hvordan" + 0.014*"Kilde" + '
  '0.014*"kilder" + 0.014*"Hvorfor" + 0.013*"tyskerne" + 0.012*"muntlige" + '
  '0.011*"medlemskap" + 0.011*"Arbeiderpartiet"'),
 (9,
  '0.025*"Gerhardsen" + 0.016*"årene" + 0.013*"Hvorfor" + '
  '0.012*"Arbeiderpartiet" + 0.012*"landene" + 0.012*"EF" + 0.011*"vestlige" + '
  '0.010*"1960" + 0.010*"1945" + 0.009*"krigen"')]
# Build the pyLDAvis data structure from the trained model, the bag-of-words
# corpus and the dictionary.
prep = genvis.prepare(lda_model, corpus, id2word)
# Optional: uncomment to write the visualisation to a standalone HTML file.
# pyLDAvis.save_html(prep, "result.html")
# Enable notebook rendering, then show the interactive view inline.
pyLDAvis.enable_notebook()
pyLDAvis.display(prep)