Topic modelling med gensim og dhlab

2.2. Topic modelling med gensim og dhlab

import dhlab as dh
import pandas as pd
import gensim
import pyLDAvis
from pprint import pprint
import pyLDAvis.gensim_models as genvis
# sprit urn URN:NBN:no-nb_digibok_2020090207537

Finn en bok

# Query dhlab for a corpus limited to a single document (limit=1) of doctype
# "digibok", using "Norgeshistorie" as the title search term.
bok = dh.Corpus(doctype="digibok", limit=1, title="Norgeshistorie")
# Bare expression: displays the corpus as the cell output.
bok
dhlabid urn title authors oaiid sesamid isbn10 city timestamp year publisher langs subjects ddc genres literaryform doctype ocr_creator ocr_timestamp
0 100523307 URN:NBN:no-nb_digibok_2010113005048 Norgeshistorie etter 1850 : lærebok Olstad , Finn oai:nb.bibsys.no:999812020474702202 874efa0171bcfd8dae972d4880ad6981 8200427943 19980101 1998 Universitetsforl. nob Historie / Eldre tid / Tidlig nytid / Verden /... Faglitteratur digibok nb 20060101
# Take the URN from the first row of the corpus frame (with limit=1 above,
# that is the only row).
urn = bok.frame.urn.tolist()[0]
# Bare expression: displays the URN as the cell output.
urn
'URN:NBN:no-nb_digibok_2010113005048'

2.2.1. Chunking

# Split the book identified by `urn` into chunks.
# NOTE(review): chunks=1000 is presumably the chunk size in tokens — confirm
# against the dhlab documentation.
res = dh.Chunks(chunks=1000, urn=urn)
# Number of chunks returned for this book.
len(res.chunks)
112
def chunks_to_corpus(chunks_list):
    """Turn per-chunk token counts back into space-separated text.

    Args:
        chunks_list: Iterable of mappings from token (str) to its integer
            count within one chunk.

    Returns:
        A list with one string per chunk. Each token is repeated as many
        times as its count, and every occurrence is followed by a single
        space (so a non-empty result ends with a trailing space). Tokens
        appear in the iteration order of the mapping; a count of 0 adds
        nothing.
    """
    # str.join avoids the quadratic cost of growing a string with += and
    # keeps this function from shadowing the notebook-level name `res`.
    return [
        "".join((token + " ") * chunk[token] for token in chunk)
        for chunk in chunks_list
    ]
# One space-separated pseudo-document per chunk, rebuilt from the raw counts.
# NOTE(review): `texts` is not referenced again in the visible notebook.
texts = chunks_to_corpus(res.chunks)

2.2.2. Find delta TFIDF

# Build a token-by-chunk count matrix: after transposing, each row is a token
# and each column is a chunk. Tokens absent from a chunk get a count of 0.
df = pd.DataFrame(res.chunks).transpose().fillna(0)
# Bare expression: displays the matrix as the cell output.
df
0 1 2 3 4 5 6 7 8 9 ... 102 103 104 105 106 107 108 109 110 111
( 7.0 5.0 4.0 8.0 3.0 1.0 4.0 6.0 4.0 3.0 ... 6.0 5.0 5.0 5.0 10.0 4.0 4.0 5.0 0.0 4.0
) 6.0 6.0 4.0 8.0 3.0 1.0 4.0 5.0 5.0 3.0 ... 6.0 5.0 5.0 4.0 11.0 4.0 3.0 5.0 0.0 4.0
, 19.0 26.0 32.0 25.0 38.0 34.0 24.0 22.0 36.0 45.0 ... 21.0 39.0 39.0 32.0 18.0 23.0 28.0 18.0 238.0 22.0
- 14.0 7.0 8.0 9.0 10.0 7.0 11.0 8.0 9.0 8.0 ... 15.0 4.0 8.0 26.0 8.0 7.0 13.0 16.0 2.0 2.0
. 58.0 50.0 51.0 46.0 48.0 53.0 61.0 46.0 51.0 55.0 ... 56.0 53.0 50.0 38.0 49.0 57.0 46.0 52.0 2.0 60.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
villspor 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
Øyenvitneskildringer 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
øker 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
ørliten 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
øyenvitneskildringer 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0

15638 rows × 112 columns

# Import the 50000 most frequent tokens from the NB collection.
tot = dh.totals(50000)
# Ratio of each token's total count in the book to its frequency in the
# reference collection. The division aligns on the index (assumes `tot` is
# indexed by token), so tokens missing from either side become NaN.
# Stored under its own name so that `res` (the Chunks object created in the
# chunking step) is not overwritten and earlier cells can still be re-run.
relative_freq = df.sum(axis=1) / tot.freq
# Keep the 1000 tokens with the highest ratio, i.e. the tokens that are most
# over-represented in the book. sort_values places NaN last, and dropna
# removes any NaN that still falls within the first 1000 rows.
target_tokens = relative_freq.sort_values(ascending=False).iloc[:1000].dropna().index
# Bare expression: displays the counts for the selected tokens.
df.loc[target_tokens]
0 1 2 3 4 5 6 7 8 9 ... 102 103 104 105 106 107 108 109 110 111
bidrog 2.0 1.0 0.0 0.0 0.0 2.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 2.0 1.0 1.0 0.0 0.0
fortida 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Eyde 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 8.0 0.0 0.0 0.0 0.0
Nygaardsvold 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 6.0 0.0 0.0 0.0 1.0 0.0
regjeringsmakten 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
partienes 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Særlig 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
885 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
arbeids- 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
251 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0

1000 rows × 112 columns

# Rebuild one pseudo-document per chunk using only the selected tokens.
# Select the rows once instead of re-running df.loc[target_tokens] for the
# loop header and again for every column.
selected = df.loc[target_tokens]

# For each chunk (column), repeat every token as many times as its count,
# each occurrence followed by a space. Counts are floats after fillna(0), so
# they are cast to int; tokens with a count of 0 are skipped.
outer_lst = [
    "".join(
        (token + " ") * int(count)
        for token, count in selected[col].items()
        if count > 0
    )
    for col in selected.columns
]
# Bare expression: displays the number of pseudo-documents (one per chunk).
len(outer_lst)
112

2.2.3. Prep for LDA

# Tokenize each pseudo-document on whitespace.
data = [document.split() for document in outer_lst]
# Dictionary built from the tokenized documents (used as id2word below).
id2word = gensim.corpora.Dictionary(data)
# Convert every tokenized document with the dictionary's doc2bow.
corpus = list(map(id2word.doc2bow, data))

2.2.4. Make model

# Train a 10-topic LDA model on the bag-of-words corpus.
# random_state seeds the model so the topics are more repeatable between
# runs; without it every execution gives different topics. Results can still
# vary somewhat because LdaMulticore trains in parallel worker processes.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=0,
)
# Print the highest-weighted terms of each topic.
pprint(lda_model.print_topics())
[(0,
  '0.024*"årene" + 0.016*"Gerhardsen" + 0.015*"Arbeiderpartiet" + '
  '0.015*"Stortinget" + 0.015*"1945" + 0.014*"1965" + 0.011*"økonomiske" + '
  '0.011*"økonomisk" + 0.010*"landene" + 0.010*"vestlige"'),
 (1,
  '0.019*"NS" + 0.014*"sosiale" + 0.013*"Nasjonal" + 0.013*"årene" + '
  '0.013*"krigen" + 0.013*"1945" + 0.011*"arbeiderne" + 0.011*"økonomiske" + '
  '0.011*"1914" + 0.011*"1900"'),
 (2,
  '0.025*"årene" + 0.016*"arbeiderne" + 0.015*"gav" + 0.013*"samfunnet" + '
  '0.013*"1914" + 0.012*"Arbeiderpartiet" + 0.011*"bidrog" + 0.010*"1900" + '
  '0.010*"førte" + 0.010*"revolusjonen"'),
 (3,
  '0.037*"årene" + 0.025*"kilder" + 0.014*"regjeringen" + 0.014*"1940" + '
  '0.013*"krigen" + 0.013*"1970" + 0.013*"Stortinget" + 0.011*"1980" + '
  '0.010*"1945" + 0.010*"1965"'),
 (4,
  '0.035*"årene" + 0.017*"Stortinget" + 0.014*"regjeringen" + 0.013*"1970" + '
  '0.012*"1905" + 0.012*"CA" + 0.010*"stemmerett" + 0.010*"Venstre" + '
  '0.010*"gav" + 0.010*"fortida"'),
 (5,
  '0.031*"krigen" + 0.023*"1940" + 0.020*"tyskerne" + 0.020*"1920" + '
  '0.018*"årene" + 0.018*"økonomiske" + 0.018*"1914" + 0.011*"1945" + '
  '0.011*"førte" + 0.011*"Nansen"'),
 (6,
  '0.011*"årene" + 0.010*"270" + 0.010*"gav" + 0.009*"263" + 0.009*"tyskerne" '
  '+ 0.009*"økonomisk" + 0.009*"regjeringen" + 0.008*"krigen" + '
  '0.008*"stemmerett" + 0.008*"271"'),
 (7,
  '0.035*"årene" + 0.015*"1945" + 0.015*"Arbeiderpartiet" + 0.014*"vekst" + '
  '0.013*"økonomisk" + 0.013*"regjeringen" + 0.012*"1940" + 0.011*"industrien" '
  '+ 0.010*"1980" + 0.010*"tyskerne"'),
 (8,
  '0.049*"årene" + 0.019*"gav" + 0.017*"Arbeiderpartiet" + 0.015*"1970" + '
  '0.014*"regjeringen" + 0.014*"økonomiske" + 0.013*"1965" + 0.012*"1980" + '
  '0.010*"1960" + 0.010*"sosiale"'),
 (9,
  '0.041*"årene" + 0.020*"1940" + 0.017*"1945" + 0.014*"krigen" + 0.011*"1914" '
  '+ 0.010*"1930" + 0.009*"vokste" + 0.009*"samfunnet" + 0.009*"1990" + '
  '0.009*"regjeringen"')]
# Prepare the pyLDAvis visualization data from the trained model, the
# bag-of-words corpus and the dictionary.
prep = genvis.prepare(lda_model, corpus, id2word)
# Optional: uncomment the next line to save the visualization to an HTML file.
# pyLDAvis.save_html(prep, "result.html")
# Enable pyLDAvis notebook mode, then display the prepared visualization
# as the cell output.
pyLDAvis.enable_notebook()
pyLDAvis.display(prep)