Topic modelling med gensim og dhlab

2.2. Topic modelling med gensim og dhlab

import dhlab as dh
import pandas as pd
import gensim
import pyLDAvis
from pprint import pprint
import pyLDAvis.gensim_models as genvis
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 3
      1 import dhlab as dh
      2 import pandas as pd
----> 3 import gensim
      4 import pyLDAvis
      5 from pprint import pprint

ModuleNotFoundError: No module named 'gensim'
# sprit urn URN:NBN:no-nb_digibok_2020090207537

Finn en bok

# Build a one-book corpus: a title search for "Norgeshistorie" among
# digitised books (doctype "digibok"), limited to a single hit.
bok = dh.Corpus(doctype="digibok", limit=1, title="Norgeshistorie")
# Bare expression: the notebook displays the corpus metadata.
bok
dhlabid urn title authors oaiid sesamid isbn10 city timestamp year publisher langs subjects ddc genres literaryform doctype ocr_creator ocr_timestamp
0 100441759 URN:NBN:no-nb_digibok_2009111200018 Elevens ressursbok : verdens- og Norgeshistori... Berger , Finn / Henningsen , Rune / Aass , Ole oai:nb.bibsys.no:999721559964702202 9d569a267470cda9851b718c9bdd20f0 8200423387 19970101 1997 Universitetsforl. nob Eldre tid / Tidlig nytid / Historie / Verden /... Faglitteratur digibok nb 20060101
# Take the URN of the first row of the corpus metadata frame; the corpus was
# created with limit=1, so this is the single book found above.
urn = bok.frame.urn.tolist()[0]
urn
'URN:NBN:no-nb_digibok_2009111200018'

2.2.1. Chunking

# Split the book into chunks.
# NOTE(review): `chunks=1000` is presumably the chunk size in tokens - confirm
# against the dhlab documentation.
res = dh.Chunks(chunks=1000, urn=urn)
# Number of chunks the book was split into.
len(res.chunks)
31
def chunks_to_corpus(chunks_list):
    """Turn per-chunk token counts into plain-text pseudo-documents.

    Args:
        chunks_list: Iterable of mappings from a token (str) to its integer
            count within one chunk.

    Returns:
        A list with one string per chunk, in input order. Every token is
        written ``count`` times, each occurrence followed by a single space,
        so a non-empty document ends with a trailing space.
    """
    return [
        "".join((token + " ") * chunk[token] for token in chunk)
        for chunk in chunks_list
    ]
# Expand every chunk's token counts into a plain-text pseudo-document.
# NOTE(review): `texts` is not referenced again in the visible part of this
# notebook; the LDA input further down is rebuilt from `df` instead.
texts = chunks_to_corpus(res.chunks)

2.2.2. Find delta TFIDF

# Token-by-chunk count matrix: one row per token, one column per chunk.
# A token that does not occur in a chunk gets 0 instead of NaN.
df = pd.DataFrame(res.chunks).T.fillna(0)
df
0 1 2 3 4 5 6 7 8 9 ... 21 22 23 24 25 26 27 28 29 30
( 8.0 0.0 4.0 1.0 4.0 4.0 2.0 4.0 11.0 4.0 ... 2.0 3.0 2.0 6.0 4.0 1.0 5.0 3.0 2.0 1.0
) 8.0 0.0 4.0 1.0 4.0 4.0 2.0 4.0 11.0 4.0 ... 2.0 3.0 2.0 5.0 5.0 1.0 5.0 3.0 2.0 1.0
, 49.0 36.0 44.0 60.0 43.0 39.0 55.0 42.0 20.0 55.0 ... 38.0 39.0 40.0 45.0 58.0 38.0 32.0 41.0 42.0 38.0
- 8.0 5.0 2.0 5.0 4.0 3.0 3.0 8.0 3.0 6.0 ... 4.0 10.0 9.0 1.0 2.0 1.0 5.0 20.0 3.0 0.0
. 22.0 38.0 25.0 27.0 25.0 25.0 37.0 24.0 32.0 36.0 ... 27.0 46.0 32.0 41.0 17.0 38.0 36.0 31.0 41.0 40.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
verdighet 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
virkelige 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
Ørkenen 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
årer 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
ødela 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0

6844 rows × 31 columns

# Reference frequencies: the 50,000 most frequent tokens in the NB collection.
tot = dh.totals(50000)
# Relative frequency: each token's total count in this book divided by its
# count in the reference collection. Tokens missing on either side become NaN.
# The result is named `relative_freq` rather than `res` so that the Chunks
# object created earlier is not overwritten and the cells above can be re-run.
relative_freq = df.sum(axis=1) / tot.freq
# Keep the (up to) 1000 tokens that are most over-represented in the book.
# NaNs are dropped before slicing, so every selected token is guaranteed to
# exist in `df` and the selection does not depend on where NaNs are sorted.
target_tokens = (
    relative_freq.dropna().sort_values(ascending=False).iloc[:1000].index
)
df.loc[target_tokens]
0 1 2 3 4 5 6 7 8 9 ... 21 22 23 24 25 26 27 28 29 30
Forstå 0.0 3.0 3.0 0.0 2.0 0.0 0.0 0.0 0.0 6.0 ... 3.0 3.0 1.0 0.0 1.0 4.0 0.0 1.0 0.0 0.0
FØR 3.0 2.0 3.0 3.0 4.0 2.0 2.0 2.0 2.0 3.0 ... 4.0 2.0 2.0 2.0 3.0 3.0 1.0 3.0 1.0 1.0
vikingene 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0
læreboka 0.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 1.0 0.0 ... 2.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
1850 4.0 2.0 3.0 3.0 4.0 2.0 2.0 2.0 2.0 3.0 ... 4.0 4.0 2.0 2.0 3.0 3.0 1.0 4.0 1.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
fortrinn 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
hustruen 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
liga 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
skitt 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
van- 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

1000 rows × 31 columns

# Rebuild one pseudo-document per chunk, keeping only the selected tokens.
# Each token is repeated as many times as it occurs in the chunk, so the
# counts survive the round trip through plain text.
# The row selection is done once here instead of once per column.
target_counts = df.loc[target_tokens]

outer_lst = [
    "".join(
        (token + " ") * int(count)  # counts are floats after fillna(0)
        for token, count in target_counts[col].items()
        if count > 0
    )
    for col in target_counts.columns
]
len(outer_lst)
31

2.2.3. Prep for LDA

# Tokenise each pseudo-document on whitespace.
data = [document.split() for document in outer_lst]
# Dictionary mapping every distinct token to an integer id.
id2word = gensim.corpora.Dictionary(data)
# Bag-of-words representation of every chunk, in the same order as `data`.
corpus = list(map(id2word.doc2bow, data))

2.2.4. Make model

# Fit a 10-topic LDA model on the bag-of-words chunks.
# `random_state` seeds the random initialisation so the topics are repeatable
# between runs, as far as LdaMulticore allows (scheduling of the worker
# processes can still introduce small differences). Without a seed the topics
# change on every run.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=42,
)
# Print the top weighted words of each topic.
pprint(lda_model.print_topics())
[(0,
  '0.022*"brenning" + 0.015*"1650" + 0.014*"faget" + 0.014*"1850" + '
  '0.014*"Hva" + 0.014*"FØR" + 0.011*"målene" + 0.011*"EUROPA" + 0.010*"NYE" + '
  '0.010*"hersker"'),
 (1,
  '0.022*"Hva" + 0.017*"1850" + 0.013*"garden" + 0.012*"Kilde" + '
  '0.011*"revolusjon" + 0.011*"sådde" + 0.010*"slags" + 0.009*"industrielle" + '
  '0.009*"skipet" + 0.008*"FØR"'),
 (2,
  '0.020*"1850" + 0.015*"Hva" + 0.015*"FØR" + 0.014*"garden" + '
  '0.012*"Kapittel" + 0.011*"Kilde" + 0.010*"sådde" + 0.009*"Forstå" + '
  '0.009*"slags" + 0.008*"1814"'),
 (3,
  '0.013*"faget" + 0.009*"målene" + 0.007*"Vurdering" + 0.006*"kjennskap" + '
  '0.006*"menneskene" + 0.006*"middels" + 0.005*"Lag" + 0.005*"læreplanen" + '
  '0.005*"læreren" + 0.005*"informere"'),
 (4,
  '0.032*"Hva" + 0.025*"Hvordan" + 0.023*"1850" + 0.020*"FØR" + '
  '0.016*"Oppgave" + 0.016*"Forstå" + 0.015*"Spørsmål" + 0.014*"slags" + '
  '0.010*"Hvilke" + 0.010*"perioden"'),
 (5,
  '0.031*"Hva" + 0.014*"1850" + 0.014*"slags" + 0.014*"Hvordan" + '
  '0.014*"Spørsmål" + 0.012*"FØR" + 0.012*"skipet" + 0.010*"Hvilke" + '
  '0.010*"Kilde" + 0.009*"vikingene"'),
 (6,
  '0.022*"Hvordan" + 0.021*"Hva" + 0.020*"FØR" + 0.019*"1850" + 0.012*"1814" + '
  '0.011*"Guri" + 0.011*"Sønsteby" + 0.010*"Kilde" + 0.010*"Spørsmål" + '
  '0.010*"slags"'),
 (7,
  '0.042*"Kapittel" + 0.032*"Hva" + 0.022*"1850" + 0.012*"FØR" + '
  '0.012*"samfunnet" + 0.012*"revolusjon" + 0.011*"Hvordan" + 0.010*"slags" + '
  '0.010*"læreplanen" + 0.010*"faget"'),
 (8,
  '0.045*"Hva" + 0.028*"1850" + 0.026*"FØR" + 0.019*"Hvordan" + '
  '0.019*"Spørsmål" + 0.017*"Kilde" + 0.016*"slags" + 0.016*"Oppgave" + '
  '0.013*"Forstå" + 0.010*"1814"'),
 (9,
  '0.027*"Danmarks" + 0.020*"Kilde" + 0.013*"Hva" + 0.012*"Hvordan" + '
  '0.011*"Spørsmål" + 0.011*"FØR" + 0.010*"rikes" + 0.010*"udi" + 0.009*"1850" '
  '+ 0.009*"raad"')]
# Prepare the pyLDAvis visualisation data from the fitted model, the
# bag-of-words corpus and the dictionary.
prep = genvis.prepare(lda_model, corpus, id2word)
# Optional: uncomment the next line to write the visualisation to an HTML file.
# pyLDAvis.save_html(prep, "result.html")
# Switch pyLDAvis to notebook output and show the interactive view.
pyLDAvis.enable_notebook()
pyLDAvis.display(prep)