Uživatel:JAnDbot/poem.py
Vzhled
Skript na import incipitu – prvních veršů básně – na Wikidata.
Vytvořeno s pomocí ChatGPT na základě harvest_template.py.
Neumí transkluzi, tato verze nemá ošetřeny vložené šablony. Viz též poem2.py
#!/usr/bin/env python3
#
# (C) JAnD, ChatGPT
#
# Distributed under the terms of MIT license.
#
import re
import signal
import sys
import pywikibot
from pywikibot import pagegenerators as pg
from pywikibot import textlib, WbMonolingualText
from pywikibot.exceptions import InvalidTitleError, NoPageError
from pywikibot.bot import WikidataBot
# Set to True by the first ctrl-c; the bot checks it before each page.
willstop = False


def _signal_handler(signum, frame) -> None:
    """Handle SIGINT: request a graceful stop first, abort the second time.

    The first interrupt only sets the module-level ``willstop`` flag and
    informs the user; any further interrupt raises ``KeyboardInterrupt``.

    :param signum: signal number passed by the ``signal`` module (unused)
    :param frame: current stack frame passed by the ``signal`` module
        (unused)
    :raises KeyboardInterrupt: if a stop has already been requested
    """
    global willstop
    if willstop:
        raise KeyboardInterrupt
    willstop = True
    pywikibot.info('Received ctrl-c. Finishing current item; '
                   'press ctrl-c again to abort.')


# Install the handler at import time so ctrl-c is intercepted while running.
signal.signal(signal.SIGINT, _signal_handler)
class PoemHarvestRobot(WikidataBot):
    """A bot to add Wikidata claims from the content of <poem> tags.

    For every page/item pair the opening verses of the first ``<poem>``
    block are extracted and stored on the item as a Czech monolingual
    text claim of property P1922.
    """

    # Opening of a {{Forma}} / {{forma}} template call.
    _FORMA_RE = re.compile(r'\{\{[Ff]orma\b')
    # Body of a <poem>...</poem> block (may span several lines).
    _POEM_RE = re.compile(r'<poem>(.*?)</poem>', re.DOTALL)
    # [[Target]] or [[Target|Label]]; group 1 is the visible text.
    _WIKILINK_RE = re.compile(r'\[\[(?:[^\]|\n]*\|)?(.*?)\]\]')
    # Templates that are dropped (including their arguments) from the poem.
    _REMOVED_TEMPLATES = (
        'Prostrkaně', 'uprostřed', 'vlevo', 'vpravo', 'verzálky',
    )
    # Number of leading characters used to recognise an existing claim.
    _MATCH_PREFIX_LENGTH = 50

    def __init__(self, **kwargs) -> None:
        """Initializer."""
        super().__init__(**kwargs)

    def setup(self) -> None:
        """Prepare the bot by caching the source items."""
        self.cacheSources()

    def treat_page_and_item(self,
                            page: pywikibot.page.BasePage | None,
                            item: pywikibot.page.ItemPage | None) -> None:
        """Process a single page/item.

        :param page: the wiki page whose text is searched for a poem
        :param item: the Wikidata item that receives the claim
        :raises KeyboardInterrupt: if a stop was requested via ctrl-c
        """
        if willstop:
            raise KeyboardInterrupt

        # Without a page there is nothing to read, without an item there
        # is nothing to write to.
        if page is None or item is None:
            return

        assert page is self.current_page
        self.site = page.site

        text = page.text
        poem_start = text.find('<poem>')
        if poem_start == -1:
            return

        # Skip the page only if {{Forma}} / {{forma}} occurs before the
        # first <poem> tag; later occurrences are irrelevant here.
        if self._FORMA_RE.search(text[:poem_start]):
            pywikibot.info(
                f'Skipping page {page.title()} because it contains '
                "'{{Forma}}' or '{{forma}}' before <poem>")
            return

        poem_content = self.extract_poem_content(text)
        if poem_content:
            self.add_claim_to_item(item, 'P1922', poem_content, 'cs')

    def extract_poem_content(self, text: str) -> str:
        """Extract the opening verses of the first <poem> tag.

        Wikilinks are reduced to their visible text, single quotes and
        the templates handled by :meth:`remove_templates` are removed.
        The first line is always taken; following lines are appended
        (joined by a space) until a blank line, a line ending with
        ``.``, ``!`` or ``?``, or the end of the poem is reached.

        :param text: wikitext of the page
        :return: the extracted verses, or an empty string if there is no
            ``<poem>`` block or it has no usable content
        """
        match = self._POEM_RE.search(text)
        if not match:
            return ''

        # [[Target|Label]] -> Label, [[Target]] -> Target
        content = self._WIKILINK_RE.sub(r'\1', match.group(1))
        content = content.replace("'", '')
        # Strip after template removal so that a template standing alone
        # on the first line does not leave an empty first verse.
        content = self.remove_templates(content).strip()
        if not content:
            return ''

        lines = content.split('\n')
        verses = [lines[0].strip()]
        for line in lines[1:]:
            line = line.strip()
            if not line:
                # End of paragraph
                break
            verses.append(line)
            if line.endswith(('.', '!', '?')):
                # End of sentence
                break
        return ' '.join(verses)

    def remove_templates(self, text: str) -> str:
        """Remove specified templates from text.

        Each listed template call is removed together with its
        arguments; the template name is matched case-insensitively.

        :param text: wikitext to clean
        :return: the text without the listed template calls
        """
        for template in self._REMOVED_TEMPLATES:
            pattern = r'\{\{' + re.escape(template) + r'[^}]*\}\}'
            text = re.sub(pattern, '', text, flags=re.IGNORECASE)
        return text

    def add_claim_to_item(self, item: pywikibot.page.ItemPage,
                          property_id: str, value: str,
                          language: str) -> None:
        """Add a monolingual text claim to the item.

        An existing claim counts as the same text when it starts with
        the first 50 characters of *value*. Such a claim is replaced by
        *value* if *value* is longer, and it gets a source if it has
        none. Otherwise a new sourced claim is added.

        :param item: the item to modify
        :param property_id: id of the property, e.g. ``'P1922'``
        :param value: the text to store
        :param language: language code of the text
        """
        prefix = value[:self._MATCH_PREFIX_LENGTH]
        existing_claim = None
        for claim in item.claims.get(property_id, []):
            target = claim.getTarget()
            # Claims without a target have no text to compare with.
            if target is not None and target.text.startswith(prefix):
                existing_claim = claim
                break

        if existing_claim is not None:
            if len(existing_claim.getTarget().text) < len(value):
                target = WbMonolingualText(text=value, language=language)
                existing_claim.changeTarget(target)
                pywikibot.info(f'Updated existing claim {property_id} '
                               f'with new value "{value}"')
            if not existing_claim.sources:
                source_claim = self.getSource(self.site)
                # No source claim may be available for this site.
                if source_claim is not None:
                    existing_claim.addSource(source_claim)
                    pywikibot.info(
                        f'Added source to existing claim {property_id} '
                        f'with value "{value}"')
            return

        # No claim with the same beginning exists: create a new one.
        claim = pywikibot.Claim(self.repo, property_id)
        claim.setTarget(WbMonolingualText(text=value, language=language))
        source_claim = self.getSource(self.site)
        if source_claim is not None:
            claim.addSource(source_claim)
        item.addClaim(claim)
        pywikibot.info(f'Added claim {property_id} with value "{value}" '
                       f'in language "{language}"')
def main(*args: str) -> None:
    """Parse the command line and run the bot.

    Global options are consumed by ``pywikibot.handle_args``; the rest
    is offered to the page generator factory. Arguments the factory
    does not recognise are ignored. Without any page generator an error
    is reported and nothing is run.

    :param args: command line arguments
    """
    factory = pg.GeneratorFactory()
    for option in pywikibot.handle_args(args):
        # The return value is not needed: unhandled arguments are ignored.
        factory.handle_arg(option)

    if factory.gens:
        pages = factory.getCombinedGenerator(preload=True)
        PoemHarvestRobot(generator=pages).run()
    else:
        pywikibot.error('No generator specified.')


if __name__ == '__main__':
    main()