2.2. Topic modelling med gensim og dhlab
import dhlab as dh
import pandas as pd
import gensim
import pyLDAvis
from pprint import pprint
import pyLDAvis.gensim_models as genvis
# sprit urn URN:NBN:no-nb_digibok_2020090207537
Finn en bok
# Query dhlab for a corpus of digitised books ("digibok") filtered on the
# title "Norgeshistorie"; limit=1 keeps the result to a single book.
bok = dh.Corpus(doctype="digibok", limit=1, title="Norgeshistorie")
bok
dhlabid | urn | title | authors | oaiid | sesamid | isbn10 | city | timestamp | year | publisher | langs | subjects | ddc | genres | literaryform | doctype | ocr_creator | ocr_timestamp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100523307 | URN:NBN:no-nb_digibok_2010113005048 | Norgeshistorie etter 1850 : lærebok | Olstad , Finn | oai:nb.bibsys.no:999812020474702202 | 874efa0171bcfd8dae972d4880ad6981 | 8200427943 | 19980101 | 1998 | Universitetsforl. | nob | Historie / Eldre tid / Tidlig nytid / Verden /... | Faglitteratur | digibok | nb | 20060101 |
# Take the URN from the first row of the corpus frame (the corpus was
# requested with limit=1).
urn = bok.frame.urn.tolist()[0]
urn
'URN:NBN:no-nb_digibok_2010113005048'
2.2.1. Chunking
# Chunks
# Split the book identified by `urn` into chunks; `res.chunks` holds one
# entry of token counts per chunk.
# NOTE(review): chunks=1000 is presumably the chunk size in tokens --
# confirm against the dhlab documentation.
res = dh.Chunks(chunks=1000, urn=urn)
len(res.chunks)
112
def chunks_to_corpus(chunks_list):
    """Rebuild one pseudo-text per chunk from token-count mappings.

    Args:
        chunks_list: Iterable of mappings, one per chunk, from token (str)
            to the integer number of times it occurs in that chunk.

    Returns:
        A list with one string per chunk. In each string every token is
        repeated ``count`` times and each occurrence is followed by a single
        space (so a non-empty string ends with a space). Tokens appear in the
        mapping's iteration order, not in original reading order.
    """
    return [
        # join avoids repeated string reallocation from += in a nested loop
        "".join((token + " ") * chunk[token] for token in chunk)
        for chunk in chunks_list
    ]
# One reconstructed bag-of-words string per chunk.
# NOTE(review): `texts` is not referenced again in the visible part of this file.
texts = chunks_to_corpus(res.chunks)
2.2.2. Find delta TF-IDF
# Token-by-chunk count matrix: after the transpose, rows are tokens and
# columns are chunk numbers. Tokens missing from a chunk get a count of 0.
df = pd.DataFrame(res.chunks).transpose().fillna(0)
df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
( | 7.0 | 5.0 | 4.0 | 8.0 | 3.0 | 1.0 | 4.0 | 6.0 | 4.0 | 3.0 | ... | 6.0 | 5.0 | 5.0 | 5.0 | 10.0 | 4.0 | 4.0 | 5.0 | 0.0 | 4.0 |
) | 6.0 | 6.0 | 4.0 | 8.0 | 3.0 | 1.0 | 4.0 | 5.0 | 5.0 | 3.0 | ... | 6.0 | 5.0 | 5.0 | 4.0 | 11.0 | 4.0 | 3.0 | 5.0 | 0.0 | 4.0 |
, | 19.0 | 26.0 | 32.0 | 25.0 | 38.0 | 34.0 | 24.0 | 22.0 | 36.0 | 45.0 | ... | 21.0 | 39.0 | 39.0 | 32.0 | 18.0 | 23.0 | 28.0 | 18.0 | 238.0 | 22.0 |
- | 14.0 | 7.0 | 8.0 | 9.0 | 10.0 | 7.0 | 11.0 | 8.0 | 9.0 | 8.0 | ... | 15.0 | 4.0 | 8.0 | 26.0 | 8.0 | 7.0 | 13.0 | 16.0 | 2.0 | 2.0 |
. | 58.0 | 50.0 | 51.0 | 46.0 | 48.0 | 53.0 | 61.0 | 46.0 | 51.0 | 55.0 | ... | 56.0 | 53.0 | 50.0 | 38.0 | 49.0 | 57.0 | 46.0 | 52.0 | 2.0 | 60.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
villspor | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
Øyenvitneskildringer | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
øker | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
ørliten | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
øyenvitneskildringer | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 |
15638 rows × 112 columns
# Reference frequencies for the top 50000 tokens in the NB collection
tot = dh.totals(50000)
# Total count of each token in the book, divided by its reference frequency.
# Tokens absent from the reference list come out as NaN.
res = df.sum(axis=1) / tot.freq
# Rank tokens by that ratio, highest first (NaN sorts last), and keep the
# first 1000, dropping any NaN that slipped into the slice.
ranked = res.sort_values(ascending=False)
target_tokens = ranked.head(1000).dropna().index
df.loc[target_tokens]
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
bidrog | 2.0 | 1.0 | 0.0 | 0.0 | 0.0 | 2.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 1.0 | 1.0 | 0.0 | 0.0 |
fortida | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Eyde | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 8.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Nygaardsvold | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 6.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
regjeringsmakten | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
partienes | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Særlig | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
885 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
arbeids- | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
251 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
1000 rows × 112 columns
# Build one pseudo-document per chunk, restricted to the selected target
# tokens: each token is repeated as many times as it occurs in that chunk,
# and every occurrence is followed by a single space.
target_counts = df.loc[target_tokens]  # select once, not once per column
outer_lst = []
for col in target_counts.columns:
    column_counts = target_counts[col]
    outer_lst.append(
        "".join(
            # counts are stored as floats after fillna(0), hence int()
            (token + " ") * int(count)
            for token, count in column_counts.items()
            if count > 0
        )
    )
len(outer_lst)
112
2.2.3. Prep for LDA
# Split every pseudo-document on whitespace into a list of tokens
data = [document.split() for document in outer_lst]
# Dictionary that assigns an integer id to each distinct token
id2word = gensim.corpora.Dictionary(data)
# Bag-of-words representation of each chunk
corpus = [id2word.doc2bow(tokens) for tokens in data]
2.2.4. Make model
# Train an LDA model with 10 topics on the bag-of-words corpus, then print
# the topics it reports.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
)
pprint(lda_model.print_topics())
[(0,
'0.024*"årene" + 0.016*"Gerhardsen" + 0.015*"Arbeiderpartiet" + '
'0.015*"Stortinget" + 0.015*"1945" + 0.014*"1965" + 0.011*"økonomiske" + '
'0.011*"økonomisk" + 0.010*"landene" + 0.010*"vestlige"'),
(1,
'0.019*"NS" + 0.014*"sosiale" + 0.013*"Nasjonal" + 0.013*"årene" + '
'0.013*"krigen" + 0.013*"1945" + 0.011*"arbeiderne" + 0.011*"økonomiske" + '
'0.011*"1914" + 0.011*"1900"'),
(2,
'0.025*"årene" + 0.016*"arbeiderne" + 0.015*"gav" + 0.013*"samfunnet" + '
'0.013*"1914" + 0.012*"Arbeiderpartiet" + 0.011*"bidrog" + 0.010*"1900" + '
'0.010*"førte" + 0.010*"revolusjonen"'),
(3,
'0.037*"årene" + 0.025*"kilder" + 0.014*"regjeringen" + 0.014*"1940" + '
'0.013*"krigen" + 0.013*"1970" + 0.013*"Stortinget" + 0.011*"1980" + '
'0.010*"1945" + 0.010*"1965"'),
(4,
'0.035*"årene" + 0.017*"Stortinget" + 0.014*"regjeringen" + 0.013*"1970" + '
'0.012*"1905" + 0.012*"CA" + 0.010*"stemmerett" + 0.010*"Venstre" + '
'0.010*"gav" + 0.010*"fortida"'),
(5,
'0.031*"krigen" + 0.023*"1940" + 0.020*"tyskerne" + 0.020*"1920" + '
'0.018*"årene" + 0.018*"økonomiske" + 0.018*"1914" + 0.011*"1945" + '
'0.011*"førte" + 0.011*"Nansen"'),
(6,
'0.011*"årene" + 0.010*"270" + 0.010*"gav" + 0.009*"263" + 0.009*"tyskerne" '
'+ 0.009*"økonomisk" + 0.009*"regjeringen" + 0.008*"krigen" + '
'0.008*"stemmerett" + 0.008*"271"'),
(7,
'0.035*"årene" + 0.015*"1945" + 0.015*"Arbeiderpartiet" + 0.014*"vekst" + '
'0.013*"økonomisk" + 0.013*"regjeringen" + 0.012*"1940" + 0.011*"industrien" '
'+ 0.010*"1980" + 0.010*"tyskerne"'),
(8,
'0.049*"årene" + 0.019*"gav" + 0.017*"Arbeiderpartiet" + 0.015*"1970" + '
'0.014*"regjeringen" + 0.014*"økonomiske" + 0.013*"1965" + 0.012*"1980" + '
'0.010*"1960" + 0.010*"sosiale"'),
(9,
'0.041*"årene" + 0.020*"1940" + 0.017*"1945" + 0.014*"krigen" + 0.011*"1914" '
'+ 0.010*"1930" + 0.009*"vokste" + 0.009*"samfunnet" + 0.009*"1990" + '
'0.009*"regjeringen"')]
# Prepare the pyLDAvis visualisation data from the trained model, the
# bag-of-words corpus and the dictionary.
prep = genvis.prepare(lda_model, corpus, id2word)
# Optional: uncomment to write the visualisation to a standalone HTML file
# pyLDAvis.save_html(prep, "result.html")
# Enable notebook rendering and show the visualisation inline
pyLDAvis.enable_notebook()
pyLDAvis.display(prep)