2.2. Topic modelling med gensim og dhlab
import dhlab as dh
import pandas as pd
import gensim
import pyLDAvis
from pprint import pprint
import pyLDAvis.gensim_models as genvis
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[1], line 3
1 import dhlab as dh
2 import pandas as pd
----> 3 import gensim
4 import pyLDAvis
5 from pprint import pprint
ModuleNotFoundError: No module named 'gensim'
# sprit urn URN:NBN:no-nb_digibok_2020090207537
Finn en bok
# Query the National Library (dhlab) for one digitized book ("digibok")
# whose title matches "Norgeshistorie"; limit=1 keeps only the top hit.
bok = dh.Corpus(doctype="digibok", limit=1, title="Norgeshistorie")
# Display the corpus metadata table (notebook output follows).
bok
dhlabid | urn | title | authors | oaiid | sesamid | isbn10 | city | timestamp | year | publisher | langs | subjects | ddc | genres | literaryform | doctype | ocr_creator | ocr_timestamp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100441759 | URN:NBN:no-nb_digibok_2009111200018 | Elevens ressursbok : verdens- og Norgeshistori... | Berger , Finn / Henningsen , Rune / Aass , Ole | oai:nb.bibsys.no:999721559964702202 | 9d569a267470cda9851b718c9bdd20f0 | 8200423387 | 19970101 | 1997 | Universitetsforl. | nob | Eldre tid / Tidlig nytid / Historie / Verden /... | Faglitteratur | digibok | nb | 20060101 |
# Extract the URN (persistent National Library identifier) of the first --
# and, with limit=1, only -- row of the corpus frame.
urn = bok.frame.urn.tolist()[0]
urn
'URN:NBN:no-nb_digibok_2009111200018'
2.2.1. Chunking
# Split the book into consecutive chunks of ~1000 tokens each; each chunk
# comes back as a {token: count} frequency mapping.
res = dh.Chunks(chunks=1000, urn=urn)
# Number of chunks produced for this book.
len(res.chunks)
31
def chunks_to_corpus(chunks_list):
    """Expand {token: count} chunk mappings into pseudo-text strings.

    Each chunk dict becomes one string in which every token is repeated
    ``count`` times, each occurrence followed by a single space, in the
    dict's insertion order (so the output has a trailing space, exactly
    like the original implementation).

    Args:
        chunks_list: iterable of mappings from token (str) to count (int).

    Returns:
        list[str]: one space-separated pseudo-text per chunk.
    """
    # "".join builds each string in a single pass; the original used
    # quadratic `inner_res += ...` concatenation inside the loop.
    return [
        "".join((token + " ") * count for token, count in chunk.items())
        for chunk in chunks_list
    ]
# Turn each chunk's frequency dict back into a running-text string.
# NOTE(review): `texts` is not referenced again in this notebook -- the LDA
# input below is rebuilt from `outer_lst` instead; possibly dead code.
texts = chunks_to_corpus(res.chunks)
2.2.2. Find delta TF-IDF
# Term-document matrix: one row per token, one column per chunk index;
# tokens missing from a chunk get a 0.0 count via fillna.
df = pd.DataFrame(res.chunks).transpose().fillna(0)
df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
( | 8.0 | 0.0 | 4.0 | 1.0 | 4.0 | 4.0 | 2.0 | 4.0 | 11.0 | 4.0 | ... | 2.0 | 3.0 | 2.0 | 6.0 | 4.0 | 1.0 | 5.0 | 3.0 | 2.0 | 1.0 |
) | 8.0 | 0.0 | 4.0 | 1.0 | 4.0 | 4.0 | 2.0 | 4.0 | 11.0 | 4.0 | ... | 2.0 | 3.0 | 2.0 | 5.0 | 5.0 | 1.0 | 5.0 | 3.0 | 2.0 | 1.0 |
, | 49.0 | 36.0 | 44.0 | 60.0 | 43.0 | 39.0 | 55.0 | 42.0 | 20.0 | 55.0 | ... | 38.0 | 39.0 | 40.0 | 45.0 | 58.0 | 38.0 | 32.0 | 41.0 | 42.0 | 38.0 |
- | 8.0 | 5.0 | 2.0 | 5.0 | 4.0 | 3.0 | 3.0 | 8.0 | 3.0 | 6.0 | ... | 4.0 | 10.0 | 9.0 | 1.0 | 2.0 | 1.0 | 5.0 | 20.0 | 3.0 | 0.0 |
. | 22.0 | 38.0 | 25.0 | 27.0 | 25.0 | 25.0 | 37.0 | 24.0 | 32.0 | 36.0 | ... | 27.0 | 46.0 | 32.0 | 41.0 | 17.0 | 38.0 | 36.0 | 31.0 | 41.0 | 40.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
verdighet | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
virkelige | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
Ørkenen | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
årer | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
ødela | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
6844 rows × 31 columns
# Reference frequencies: top 50,000 tokens from the whole NB collection.
tot = dh.totals(50000)
# Relative frequency: how over-represented each token is in this book
# compared with the national reference corpus.
# Use a fresh name instead of re-binding `res` -- the original clobbered
# the dh.Chunks object created earlier, a confusing shadowing hazard.
ratio = df.sum(axis=1) / tot.freq
# Keep the 1000 most over-represented tokens; dropna discards tokens that
# are absent from `tot` (their ratio is NaN and sorts last).
target_tokens = ratio.sort_values(ascending=False).iloc[:1000].dropna().index
df.loc[target_tokens]
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Forstå | 0.0 | 3.0 | 3.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6.0 | ... | 3.0 | 3.0 | 1.0 | 0.0 | 1.0 | 4.0 | 0.0 | 1.0 | 0.0 | 0.0 |
FØR | 3.0 | 2.0 | 3.0 | 3.0 | 4.0 | 2.0 | 2.0 | 2.0 | 2.0 | 3.0 | ... | 4.0 | 2.0 | 2.0 | 2.0 | 3.0 | 3.0 | 1.0 | 3.0 | 1.0 | 1.0 |
vikingene | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
læreboka | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | ... | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
1850 | 4.0 | 2.0 | 3.0 | 3.0 | 4.0 | 2.0 | 2.0 | 2.0 | 2.0 | 3.0 | ... | 4.0 | 4.0 | 2.0 | 2.0 | 3.0 | 3.0 | 1.0 | 4.0 | 1.0 | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
fortrinn | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
hustruen | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
liga | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
skitt | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
van- | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1000 rows × 31 columns
# Rebuild one pseudo-text per chunk, restricted to the 1000 target tokens.
# Hoist the row selection: the original recomputed `df.loc[target_tokens]`
# on every loop iteration (twice per column) and built each string with
# quadratic `+=` concatenation.
sub = df.loc[target_tokens]
outer_lst = [
    "".join(
        (token + " ") * int(count)
        for token, count in sub[col].items()
        if count > 0
    )
    for col in sub.columns
]
len(outer_lst)
31
2.2.3. Prep for LDA
# Tokenize each pseudo-text into a plain list of words.
data = [doc.split() for doc in outer_lst]
# Vocabulary mapping every unique token to an integer id.
id2word = gensim.corpora.Dictionary(data)
# Bag-of-words form expected by gensim: (token_id, count) pairs per chunk.
corpus = [id2word.doc2bow(tokens) for tokens in data]
2.2.4. Make model
# Train a 10-topic LDA model over the chunk corpus; LdaMulticore
# parallelizes training across CPU cores.
# NOTE(review): no random_state is set, so topics differ between runs.
lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=10)
# Show the top-weighted words for each of the ten topics.
pprint(lda_model.print_topics())
[(0,
'0.022*"brenning" + 0.015*"1650" + 0.014*"faget" + 0.014*"1850" + '
'0.014*"Hva" + 0.014*"FØR" + 0.011*"målene" + 0.011*"EUROPA" + 0.010*"NYE" + '
'0.010*"hersker"'),
(1,
'0.022*"Hva" + 0.017*"1850" + 0.013*"garden" + 0.012*"Kilde" + '
'0.011*"revolusjon" + 0.011*"sådde" + 0.010*"slags" + 0.009*"industrielle" + '
'0.009*"skipet" + 0.008*"FØR"'),
(2,
'0.020*"1850" + 0.015*"Hva" + 0.015*"FØR" + 0.014*"garden" + '
'0.012*"Kapittel" + 0.011*"Kilde" + 0.010*"sådde" + 0.009*"Forstå" + '
'0.009*"slags" + 0.008*"1814"'),
(3,
'0.013*"faget" + 0.009*"målene" + 0.007*"Vurdering" + 0.006*"kjennskap" + '
'0.006*"menneskene" + 0.006*"middels" + 0.005*"Lag" + 0.005*"læreplanen" + '
'0.005*"læreren" + 0.005*"informere"'),
(4,
'0.032*"Hva" + 0.025*"Hvordan" + 0.023*"1850" + 0.020*"FØR" + '
'0.016*"Oppgave" + 0.016*"Forstå" + 0.015*"Spørsmål" + 0.014*"slags" + '
'0.010*"Hvilke" + 0.010*"perioden"'),
(5,
'0.031*"Hva" + 0.014*"1850" + 0.014*"slags" + 0.014*"Hvordan" + '
'0.014*"Spørsmål" + 0.012*"FØR" + 0.012*"skipet" + 0.010*"Hvilke" + '
'0.010*"Kilde" + 0.009*"vikingene"'),
(6,
'0.022*"Hvordan" + 0.021*"Hva" + 0.020*"FØR" + 0.019*"1850" + 0.012*"1814" + '
'0.011*"Guri" + 0.011*"Sønsteby" + 0.010*"Kilde" + 0.010*"Spørsmål" + '
'0.010*"slags"'),
(7,
'0.042*"Kapittel" + 0.032*"Hva" + 0.022*"1850" + 0.012*"FØR" + '
'0.012*"samfunnet" + 0.012*"revolusjon" + 0.011*"Hvordan" + 0.010*"slags" + '
'0.010*"læreplanen" + 0.010*"faget"'),
(8,
'0.045*"Hva" + 0.028*"1850" + 0.026*"FØR" + 0.019*"Hvordan" + '
'0.019*"Spørsmål" + 0.017*"Kilde" + 0.016*"slags" + 0.016*"Oppgave" + '
'0.013*"Forstå" + 0.010*"1814"'),
(9,
'0.027*"Danmarks" + 0.020*"Kilde" + 0.013*"Hva" + 0.012*"Hvordan" + '
'0.011*"Spørsmål" + 0.011*"FØR" + 0.010*"rikes" + 0.010*"udi" + 0.009*"1850" '
'+ 0.009*"raad"')]
# Build the interactive pyLDAvis visualisation from the trained model,
# the bag-of-words corpus, and the vocabulary.
prep = genvis.prepare(lda_model, corpus, id2word)
# To write a standalone HTML file instead, uncomment:
# pyLDAvis.save_html(prep, "result.html")
# Render the visualisation inline in the notebook.
pyLDAvis.enable_notebook()
pyLDAvis.display(prep)