2.2. Topic modelling med gensim og dhlab#
import dhlab as dh
import pandas as pd
import gensim
import pyLDAvis
from pprint import pprint
import pyLDAvis.gensim_models as genvis
# sprit urn URN:NBN:no-nb_digibok_2020090207537
Finn en bok
# Build a dhlab corpus holding a single digitised book ("digibok"),
# searched by the title term "Norgeshistorie" (limit=1 keeps one hit).
bok = dh.Corpus(doctype="digibok", limit=1, title="Norgeshistorie")
bok
| dhlabid | urn | title | authors | oaiid | sesamid | isbn10 | city | timestamp | year | publisher | langs | subjects | ddc | genres | literaryform | doctype | ocr_creator | ocr_timestamp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100284219 | URN:NBN:no-nb_digibok_2016021608256 | Norgeshistorie etter 1850 , VK II : lærebok | Olstad , Finn | oai:nb.bibsys.no:999408838804702202 | 784f7a0ffde006a7f606ed4f65b29202 | 8200408442 | 19940101 | 1994 | Universitetsforl. | nob | Historie / historie / norge | Faglitteratur | digibok | nb | 20060101 |
# Take the URN from the first row of the corpus frame (the only row, given limit=1)
urn = bok.frame.urn.tolist()[0]
urn
'URN:NBN:no-nb_digibok_2016021608256'
2.2.1. Chunking#
# Fetch the book identified by urn split into chunks.
# NOTE(review): chunks=1000 is presumably the chunk size in tokens — confirm against the dhlab docs.
res = dh.Chunks(chunks=1000, urn=urn)
# Number of chunks that came back
len(res.chunks)
116
def chunks_to_corpus(chunks_list):
    """Turn per-chunk token counts into whitespace-separated pseudo-documents.

    Args:
        chunks_list: Iterable of mappings from token to the number of times
            that token occurs in one chunk.

    Returns:
        A list with one string per chunk. Each token is repeated ``count``
        times and every occurrence is followed by a single space, so the
        string can be tokenised again with ``str.split()``.
    """
    return [
        # int() allows counts that arrive as floats (e.g. 2.0) to be used as
        # repeat factors; join avoids rebuilding the string for every token.
        "".join((token + " ") * int(count) for token, count in chunk.items())
        for chunk in chunks_list
    ]
# Expand the per-chunk token counts into plain-text pseudo-documents.
# NOTE(review): texts is not referenced again in the visible part of this notebook — confirm it is still needed.
texts = chunks_to_corpus(res.chunks)
2.2.2. Find delta TFIDF#
# Token-by-chunk count matrix: one row per token, one column per chunk.
# A token that does not occur in a chunk gets 0 instead of NaN.
df = pd.DataFrame(res.chunks).T.fillna(0)
df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| $ | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| ( | 3.0 | 2.0 | 59.0 | 2.0 | 3.0 | 0.0 | 9.0 | 3.0 | 3.0 | 0.0 | ... | 7.0 | 3.0 | 6.0 | 1.0 | 2.0 | 4.0 | 3.0 | 4.0 | 1.0 | 5.0 |
| ) | 2.0 | 2.0 | 58.0 | 2.0 | 3.0 | 0.0 | 10.0 | 3.0 | 3.0 | 0.0 | ... | 7.0 | 3.0 | 4.0 | 1.0 | 2.0 | 3.0 | 1.0 | 4.0 | 2.0 | 5.0 |
| , | 44.0 | 25.0 | 41.0 | 30.0 | 29.0 | 12.0 | 75.0 | 27.0 | 29.0 | 34.0 | ... | 22.0 | 32.0 | 27.0 | 34.0 | 24.0 | 42.0 | 34.0 | 38.0 | 45.0 | 46.0 |
| - | 2.0 | 4.0 | 52.0 | 9.0 | 3.0 | 3.0 | 14.0 | 12.0 | 26.0 | 10.0 | ... | 12.0 | 13.0 | 15.0 | 6.0 | 11.0 | 6.0 | 15.0 | 13.0 | 6.0 | 8.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| vekkelsesbevegelsen | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| venstrepolitiker | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| vestnorske | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| årsakssammenhenger | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| ætten | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
17083 rows × 116 columns
# Reference counts: the 50000 most frequent tokens in the NB collection
tot = dh.totals(50000)
# Per token: total count in this book divided by its count in the reference list.
# Tokens missing on either side of the division end up as NaN.
res = df.sum(axis=1) / tot.freq
# Rank by that ratio (NaN sorts last), keep the first 1000 and discard any NaN
# that slipped in, leaving the tokens most over-represented in the book.
ranked = res.sort_values(ascending=False)
target_tokens = ranked.iloc[:1000].dropna().index
df.loc[target_tokens]
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| bidrog | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| Eyde | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| regjeringsmakten | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| samfunnsutviklingen | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| Nygaardsvold | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| harme | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| statlige | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| oppløst | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| spasere | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| Offentlig | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1000 rows × 116 columns
# Build one pseudo-document per chunk (column): every target token is written
# out as many times as it occurs in that chunk, each followed by a space.
target_df = df.loc[target_tokens]  # select the target rows once instead of once per column
outer_lst = []
for col in target_df.columns:
    inner_str = "".join(
        (i + " ") * int(x)  # the counts are floats in df, so cast before repeating
        for i, x in target_df[col].items()
        if x > 0
    )
    outer_lst.append(inner_str)
len(outer_lst)
116
2.2.3. Prep for LDA#
# Split every pseudo-document back into a list of tokens
data = [doc.split() for doc in outer_lst]
# Dictionary that assigns an integer id to each distinct token
id2word = gensim.corpora.Dictionary(data)
# Bag-of-words representation of each chunk, expressed with those ids
corpus = [id2word.doc2bow(tokens) for tokens in data]
2.2.4. Make model#
# Train a 10-topic LDA model on the bag-of-words chunks.
# Without a fixed random_state every run starts from a different random
# initialisation, so the topics shown in the notebook cannot be reproduced.
# NOTE(review): with several worker processes the result may still vary slightly — confirm.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=42,
)
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=10427) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=10427) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=10427) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=10427) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=10427) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=10427) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=10427) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
# Pretty-print each topic as a weighted list of its top words
pprint(lda_model.print_topics())
[(0,
'0.037*"årene" + 0.014*"Arbeiderpartiet" + 0.013*"Stortinget" + '
'0.009*"sosiale" + 0.009*"samfunnet" + 0.008*"krigen" + 0.008*"utviklingen" '
'+ 0.008*"arbeidere" + 0.008*"Kilde" + 0.008*"preget"'),
(1,
'0.030*"årene" + 0.016*"samiske" + 0.013*"1960" + 0.012*"samene" + '
'0.011*"Eyde" + 0.010*"1970" + 0.010*"Hvorfor" + 0.010*"Rjukan" + '
'0.009*"kvinnene" + 0.008*"gav"'),
(2,
'0.023*"årene" + 0.017*"Arbeiderpartiet" + 0.012*"regjeringen" + 0.012*"gav" '
'+ 0.011*"Gerhardsen" + 0.010*"unionen" + 0.009*"kvinnene" + 0.008*"122" + '
'0.008*"181" + 0.008*"127"'),
(3,
'0.039*"årene" + 0.016*"krigen" + 0.015*"regjeringen" + 0.015*"Stortinget" + '
'0.013*"gav" + 0.012*"tyskerne" + 0.010*"1970" + 0.010*"Hvordan" + '
'0.010*"1800" + 0.009*"bidrog"'),
(4,
'0.035*"årene" + 0.017*"arbeiderne" + 0.011*"gav" + 0.010*"Hvordan" + '
'0.010*"1970" + 0.008*"Arbeiderpartiet" + 0.008*"arbeidere" + '
'0.007*"samfunnet" + 0.007*"1960" + 0.007*"sosiale"'),
(5,
'0.024*"årene" + 0.019*"Venstre" + 0.013*"regjeringen" + 0.012*"Hvordan" + '
'0.012*"abort" + 0.011*"næringslivet" + 0.010*"EF" + 0.010*"samfunnet" + '
'0.010*"Hvorfor" + 0.009*"Stortinget"'),
(6,
'0.036*"NS" + 0.031*"krigen" + 0.023*"tyskerne" + 0.014*"årene" + '
'0.012*"Hvordan" + 0.011*"okkupasjonen" + 0.011*"Nasjonal" + '
'0.011*"motstand" + 0.010*"Arbeiderpartiet" + 0.009*"utviklingen"'),
(7,
'0.035*"0,1" + 0.016*"regjeringen" + 0.014*"tyskerne" + 0.014*"årene" + '
'0.013*"NS" + 0.012*"Kilde" + 0.011*"0,2" + 0.010*"0,3" + 0.010*"motstand" + '
'0.010*"motstanden"'),
(8,
'0.019*"krigen" + 0.017*"NS" + 0.015*"Hvordan" + 0.014*"Kilde" + '
'0.014*"kilder" + 0.014*"Hvorfor" + 0.013*"tyskerne" + 0.012*"muntlige" + '
'0.011*"medlemskap" + 0.011*"Arbeiderpartiet"'),
(9,
'0.025*"Gerhardsen" + 0.016*"årene" + 0.013*"Hvorfor" + '
'0.012*"Arbeiderpartiet" + 0.012*"landene" + 0.012*"EF" + 0.011*"vestlige" + '
'0.010*"1960" + 0.010*"1945" + 0.009*"krigen"')]
# Prepare the pyLDAvis visualisation data from the model, the corpus and the dictionary
prep = genvis.prepare(lda_model, corpus, id2word)
# Save to html
# pyLDAvis.save_html(prep, "result.html")
# Switch pyLDAvis to notebook output and show the prepared visualisation inline
pyLDAvis.enable_notebook()
pyLDAvis.display(prep)