#!pip install -U dhlab
import os
import datetime as dt
import pandas as pd
import dhlab as dh
from code.sentiment import load_corpus_from_file, compute_sentiment_analysis, add_urls
2.1. Sentimentanalyse i aviskorpus#
Angi nøkkelord
Hent inn korpus
Score sentiment for forekomster av et ord
Lagre resultater i CSV-fil
# INPUT REQUIRED
word = "biblioteket"  # Keyword that must occur in the texts
2.1.1. Korpusdefinisjon#
Fyll inn filsti til lokal csv- eller excel-fil med korpusdefinisjon:
# INPUT REQUIRED
file_path = "FYLL INN"  # Path to a local corpus definition file, if any
Eventuelt definer et korpus direkte med parametere.
# INPUT REQUIRED
city = "Kristiansand"   # Place where the newspapers were published
from_year = 2000        # Start of the search period
to_year = 2022          # End of the search period
number_of_docs = 10000  # Maximum number of documents in the text selection
# The corpus variable points to the corpus definition, as a dhlab.Corpus object.
# Prefer a local, file-based definition when one exists; otherwise build the
# corpus from the search parameters above.
if os.path.exists(file_path):
    corpus = load_corpus_from_file(file_path)
    print("Lastet corpus fra fil: ", file_path)
else:
    corpus = dh.Corpus(
        doctype="digavis",         # Newspapers only
        fulltext=word,             # Keyword that must occur in the full text
        freetext=f"city: {city}",  # Restrict to the chosen place of publication
        from_year=from_year,
        to_year=to_year,
        limit=number_of_docs,
    )
    print("Lastet corpus fra parametere: ")
    print(f"Aviser, utgitt mellom {from_year} og {to_year} i {city}, der ordet \"{word}\" forekommer.")
print(f"corpus består av {corpus.size} utgivelser")
Lastet corpus fra parametere:
Aviser, utgitt mellom 2000 og 2022 i Kristiansand, der ordet "biblioteket" forekommer.
corpus består av 3441 utgivelser
Valgfritt: Sett save_to_file=True
hvis du vil lagre resulterende korpus til en ny, datostemplet CSV-fil
# INPUT REQUIRED
save_to_file = False  # Whether to save the text selection as a CSV file for later use

if save_to_file:
    # Build a date-stamped file name covering the corpus' year range.
    years = corpus.frame.year
    new_file_path = (
        f"corpus_avis_{years.min()}_{years.max()}_{city}_{word}"
        f"_created{dt.date.today()}.csv"
    )
    corpus.to_csv(new_file_path)
2.1.2. Sentimentscore#
Kontekstene der nøkkelordet forekommer hentes ut fra korpuset, og vi beregner en sentimentscore på hvert tekstutdrag.
Det er en enkel analyse, med positive og negative ordlister: Vi regner ut differansen mellom summene av positive og negative ord i konteksten.
Ordlistene vi bruker her, NorSentLex, er utviklet av LTG-gruppen ved UiO ifm. SANT-prosjektet.
# Assign a sentiment score to each context in which the keyword occurs.
result = compute_sentiment_analysis(corpus, word)
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
Cell In[7], line 2
1 # Beregn en sentimentscore til kontekstene der nøkkelordet forekommer.
----> 2 result = compute_sentiment_analysis(corpus, word)
File ~/projects/digital_tekstanalyse/cookbook/code/sentiment.py:260, in compute_sentiment_analysis(*args, **kwargs)
258 def compute_sentiment_analysis(*args, **kwargs):
259 """Compute sentiment score on the input data."""
--> 260 return count_and_score_target_words(*args, **kwargs)
File ~/projects/digital_tekstanalyse/cookbook/code/sentiment.py:243, in count_and_score_target_words(corpus, word)
237 word_freq = word_freq.merge(
238 conc, how="inner", left_on=docid_column, right_on=docid_column
239 )
241 pos, neg = load_norsentlex()
--> 243 word_freq[["positive", "negative"]] = word_freq.apply(
244 lambda x: score_sentiment(x.conc, pos, neg), axis=1, result_type="expand"
245 )
246 word_freq["sentimentscore"] = word_freq["positive"] - word_freq["negative"]
248 df = corpus.frame.merge(
249 word_freq.drop(columns="conc"),
250 how="inner",
251 left_on=docid_column,
252 right_on=docid_column,
253 )
File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/frame.py:9423, in DataFrame.apply(self, func, axis, raw, result_type, args, **kwargs)
9412 from pandas.core.apply import frame_apply
9414 op = frame_apply(
9415 self,
9416 func=func,
(...)
9421 kwargs=kwargs,
9422 )
-> 9423 return op.apply().__finalize__(self, method="apply")
File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/apply.py:678, in FrameApply.apply(self)
675 elif self.raw:
676 return self.apply_raw()
--> 678 return self.apply_standard()
File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/apply.py:798, in FrameApply.apply_standard(self)
797 def apply_standard(self):
--> 798 results, res_index = self.apply_series_generator()
800 # wrap results
801 return self.wrap_results(results, res_index)
File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/apply.py:814, in FrameApply.apply_series_generator(self)
811 with option_context("mode.chained_assignment", None):
812 for i, v in enumerate(series_gen):
813 # ignore SettingWithCopy here in case the user mutates
--> 814 results[i] = self.f(v)
815 if isinstance(results[i], ABCSeries):
816 # If we have a view on v, we need to make a copy because
817 # series_generator will swap out the underlying data
818 results[i] = results[i].copy(deep=False)
File ~/projects/digital_tekstanalyse/cookbook/code/sentiment.py:244, in count_and_score_target_words.<locals>.<lambda>(x)
237 word_freq = word_freq.merge(
238 conc, how="inner", left_on=docid_column, right_on=docid_column
239 )
241 pos, neg = load_norsentlex()
243 word_freq[["positive", "negative"]] = word_freq.apply(
--> 244 lambda x: score_sentiment(x.conc, pos, neg), axis=1, result_type="expand"
245 )
246 word_freq["sentimentscore"] = word_freq["positive"] - word_freq["negative"]
248 df = corpus.frame.merge(
249 word_freq.drop(columns="conc"),
250 how="inner",
251 left_on=docid_column,
252 right_on=docid_column,
253 )
File ~/projects/digital_tekstanalyse/cookbook/code/sentiment.py:217, in score_sentiment(text, positive, negative)
215 """Calculate a sentiment score for the ``text`` input."""
216 context = count_tokens(text)
--> 217 sent_counts = [
218 count_matching_tokens(context, sent_terms).counts.sum()
219 if not context.empty
220 else 0
221 for sent_terms in (positive, negative)
222 ]
223 return sent_counts
File ~/projects/digital_tekstanalyse/cookbook/code/sentiment.py:218, in <listcomp>(.0)
215 """Calculate a sentiment score for the ``text`` input."""
216 context = count_tokens(text)
217 sent_counts = [
--> 218 count_matching_tokens(context, sent_terms).counts.sum()
219 if not context.empty
220 else 0
221 for sent_terms in (positive, negative)
222 ]
223 return sent_counts
File ~/projects/digital_tekstanalyse/cookbook/code/sentiment.py:144, in count_matching_tokens(token_counts, terms)
142 def count_matching_tokens(token_counts: pd.DataFrame, terms: pd.Series) -> pd.DataFrame:
143 """Combine word counts with a series of terms."""
--> 144 target_terms = terms.join(token_counts, how="inner", on="terms")
145 return target_terms
File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/frame.py:9729, in DataFrame.join(self, other, on, how, lsuffix, rsuffix, sort, validate)
9566 def join(
9567 self,
9568 other: DataFrame | Series | Iterable[DataFrame | Series],
(...)
9574 validate: str | None = None,
9575 ) -> DataFrame:
9576 """
9577 Join columns of another DataFrame.
9578
(...)
9727 5 K1 A5 B1
9728 """
-> 9729 return self._join_compat(
9730 other,
9731 on=on,
9732 how=how,
9733 lsuffix=lsuffix,
9734 rsuffix=rsuffix,
9735 sort=sort,
9736 validate=validate,
9737 )
File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/frame.py:9768, in DataFrame._join_compat(self, other, on, how, lsuffix, rsuffix, sort, validate)
9758 if how == "cross":
9759 return merge(
9760 self,
9761 other,
(...)
9766 validate=validate,
9767 )
-> 9768 return merge(
9769 self,
9770 other,
9771 left_on=on,
9772 how=how,
9773 left_index=on is None,
9774 right_index=True,
9775 suffixes=(lsuffix, rsuffix),
9776 sort=sort,
9777 validate=validate,
9778 )
9779 else:
9780 if on is not None:
File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/reshape/merge.py:162, in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
131 @Substitution("\nleft : DataFrame or named Series")
132 @Appender(_merge_doc, indents=0)
133 def merge(
(...)
146 validate: str | None = None,
147 ) -> DataFrame:
148 op = _MergeOperation(
149 left,
150 right,
(...)
160 validate=validate,
161 )
--> 162 return op.get_result(copy=copy)
File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/reshape/merge.py:809, in _MergeOperation.get_result(self, copy)
806 if self.indicator:
807 self.left, self.right = self._indicator_pre_merge(self.left, self.right)
--> 809 join_index, left_indexer, right_indexer = self._get_join_info()
811 result = self._reindex_and_concat(
812 join_index, left_indexer, right_indexer, copy=copy
813 )
814 result = result.__finalize__(self, method=self._merge_type)
File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/reshape/merge.py:1065, in _MergeOperation._get_join_info(self)
1061 join_index, right_indexer, left_indexer = _left_join_on_index(
1062 right_ax, left_ax, self.right_join_keys, sort=self.sort
1063 )
1064 else:
-> 1065 (left_indexer, right_indexer) = self._get_join_indexers()
1067 if self.right_index:
1068 if len(self.left) > 0:
File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/reshape/merge.py:1038, in _MergeOperation._get_join_indexers(self)
1036 def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
1037 """return the join indexers"""
-> 1038 return get_join_indexers(
1039 self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
1040 )
File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/reshape/merge.py:1665, in get_join_indexers(left_keys, right_keys, sort, how, **kwargs)
1660 # get left & right join labels and num. of levels at each location
1661 mapped = (
1662 _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
1663 for n in range(len(left_keys))
1664 )
-> 1665 zipped = zip(*mapped)
1666 llab, rlab, shape = (list(x) for x in zipped)
1668 # get flat i8 keys from label lists
File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/reshape/merge.py:1662, in <genexpr>(.0)
1658 return _get_no_sort_one_missing_indexer(left_n, False)
1660 # get left & right join labels and num. of levels at each location
1661 mapped = (
-> 1662 _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
1663 for n in range(len(left_keys))
1664 )
1665 zipped = zip(*mapped)
1666 llab, rlab, shape = (list(x) for x in zipped)
File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/reshape/merge.py:2442, in _factorize_keys(lk, rk, sort, how)
2435 rlab = rizer.factorize(
2436 rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna()
2437 )
2438 else:
2439 # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
2440 # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
2441 # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
-> 2442 llab = rizer.factorize(lk) # type: ignore[arg-type]
2443 rlab = rizer.factorize(rk) # type: ignore[arg-type]
2444 assert llab.dtype == np.dtype(np.intp), llab.dtype
KeyboardInterrupt:
# Inspect the first rows of the sentiment result table.
result.head()
dhlabid | urn | title | city | timestamp | year | doctype | word | count | positive | negative | sentimentscore | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 201310916 | URN:NBN:no-nb_digavis_oestsida_null_null_20040... | oestsida | Kristiansand | 20040211 | 2004 | digavis | biblioteket | 4 | 0 | 0 | 0 |
1 | 201310916 | URN:NBN:no-nb_digavis_oestsida_null_null_20040... | oestsida | Kristiansand | 20040211 | 2004 | digavis | biblioteket | 4 | 1 | 0 | 1 |
2 | 201310916 | URN:NBN:no-nb_digavis_oestsida_null_null_20040... | oestsida | Kristiansand | 20040211 | 2004 | digavis | biblioteket | 4 | 0 | 1 | -1 |
3 | 201310897 | URN:NBN:no-nb_digavis_oestsida_null_null_20021... | oestsida | Kristiansand | 20021204 | 2002 | digavis | biblioteket | 1 | 3 | 1 | 2 |
4 | 201310918 | URN:NBN:no-nb_digavis_oestsida_null_null_20040... | oestsida | Kristiansand | 20040310 | 2004 | digavis | biblioteket | 6 | 1 | 2 | -1 |
2.1.2.1. Tilbakekobling fra datapunkt til kildetekst:#
URNene i datarammen er referanser til de scannede dokumentene.
Ved å bytte ut “URN” i følgende adresse med en reell verdi kan du lese avisen i nettbiblioteket (nb.no): https://www.nb.no/items/
URN
f.eks. https://www.nb.no/items/URN:NBN:no-nb_digavis_kristiansandavis_null_null_20100204_4_5_1
Vi har også en funksjon som konstruerer URLene og lagrer dem til en ny kolonne i datarammen. Outputet fra kodeblokken under må kopieres og limes inn i nettleseren.
row_number = 1  # Change the row number as you wish

# Attach a "url" column linking each row to the scanned newspaper issue.
result = add_urls(result)
print(result["url"][row_number])
https://www.nb.no/items/URN:NBN:no-nb_digavis_oestsida_null_null_20040211_3_3_1?searchText=biblioteket
# Display the full result frame (rendered as a table by the notebook).
result
dhlabid | urn | title | city | timestamp | year | doctype | word | count | positive | negative | sentimentscore | url | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 201310916 | URN:NBN:no-nb_digavis_oestsida_null_null_20040... | oestsida | Kristiansand | 20040211 | 2004 | digavis | biblioteket | 4 | 0 | 0 | 0 | https://www.nb.no/items/URN:NBN:no-nb_digavis_... |
1 | 201310916 | URN:NBN:no-nb_digavis_oestsida_null_null_20040... | oestsida | Kristiansand | 20040211 | 2004 | digavis | biblioteket | 4 | 1 | 0 | 1 | https://www.nb.no/items/URN:NBN:no-nb_digavis_... |
2 | 201310916 | URN:NBN:no-nb_digavis_oestsida_null_null_20040... | oestsida | Kristiansand | 20040211 | 2004 | digavis | biblioteket | 4 | 0 | 1 | -1 | https://www.nb.no/items/URN:NBN:no-nb_digavis_... |
3 | 201310897 | URN:NBN:no-nb_digavis_oestsida_null_null_20021... | oestsida | Kristiansand | 20021204 | 2002 | digavis | biblioteket | 1 | 3 | 1 | 2 | https://www.nb.no/items/URN:NBN:no-nb_digavis_... |
4 | 201310918 | URN:NBN:no-nb_digavis_oestsida_null_null_20040... | oestsida | Kristiansand | 20040310 | 2004 | digavis | biblioteket | 6 | 1 | 2 | -1 | https://www.nb.no/items/URN:NBN:no-nb_digavis_... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5577 | 200483072 | URN:NBN:no-nb_digavis_faedrelandsvennen_null_n... | faedrelandsvennen | Kristiansand | 20021008 | 2002 | digavis | biblioteket | 10 | 1 | 1 | 0 | https://www.nb.no/items/URN:NBN:no-nb_digavis_... |
5578 | 200486058 | URN:NBN:no-nb_digavis_faedrelandsvennen_null_n... | faedrelandsvennen | Kristiansand | 20160830 | 2016 | digavis | biblioteket | 1 | 1 | 2 | -1 | https://www.nb.no/items/URN:NBN:no-nb_digavis_... |
5579 | 200485951 | URN:NBN:no-nb_digavis_faedrelandsvennen_null_n... | faedrelandsvennen | Kristiansand | 20160423 | 2016 | digavis | biblioteket | 2 | 1 | 0 | 1 | https://www.nb.no/items/URN:NBN:no-nb_digavis_... |
5580 | 200485951 | URN:NBN:no-nb_digavis_faedrelandsvennen_null_n... | faedrelandsvennen | Kristiansand | 20160423 | 2016 | digavis | biblioteket | 2 | 1 | 0 | 1 | https://www.nb.no/items/URN:NBN:no-nb_digavis_... |
5581 | 200482629 | URN:NBN:no-nb_digavis_faedrelandsvennen_null_n... | faedrelandsvennen | Kristiansand | 20010417 | 2001 | digavis | biblioteket | 1 | 5 | 0 | 5 | https://www.nb.no/items/URN:NBN:no-nb_digavis_... |
5582 rows × 13 columns
2.1.3. Visualiser resultatet#
# Aggregate sentiment counts per publication year and plot the trend.
score_columns = ["sentimentscore", "positive", "negative"]
r = result[["year", "positive", "negative", "sentimentscore"]]
rgroup = r.groupby("year")[score_columns].sum()
rgroup.plot()
<Axes: xlabel='year'>

2.1.4. Lagre data#
Skriv utdata til en CSV-fil på ditt lokale filsystem:
Kolonne |
Beskrivelse |
---|---|
dhlabid |
DH-labens ID-nummer for det digitale tekstobjektet (OCR-scannet tekst) i databasene |
urn |
Unique Resource Name (digitalt bilde av tekstdokumentet, tilgjengelig i nettbiblioteket) |
title |
Avistittel, navn på publikasjon |
city |
Publiseringssted (oftest en by) |
timestamp |
datostempel i ISO-format (YYYYMMDD) |
year |
årstall for publikasjonen |
doctype |
Dokumenttype (her er det bare aviser, “digavis”) |
word |
nøkkelord i tekstutdragene (konkordansene) som sentimentanalysen ble utført på |
count |
ordfrekvens: antall ganger nøkkelordet forekommer i den gitte avisutgivelsen |
positive |
antall positive ord i kontekstene nøkkelordet forekommer i |
negative |
antall negative ord i kontekstene |
sentimentscore |
differansen positiv score - negativ score |
url |
lenke til avisen i nettbiblioteket, inkl. søk på nøkkelordet |
# Write the result table to a date-stamped CSV file on the local file system.
today = dt.date.today()
outputfile = f"sentimentanalyse_aviskorpus_{from_year}-{to_year}_{today}.csv"
result.to_csv(outputfile)