Uživatel:JAnDbot/poem2.py
Vzhled
Skript na import incipitu – prvních veršů básně – na Wikidata. Vytvořeno s pomocí ChatGPT na základě harvest_template.py. Pokus o zpracování transkluze je zatím neúspěšný; tato verze by mohla mít ošetřeny vložené šablony – netestováno.
import re
import signal
import sys
import pywikibot
from pywikibot import pagegenerators as pg
from pywikibot import textlib, WbMonolingualText
from pywikibot.exceptions import InvalidTitleError, NoPageError
from pywikibot.bot import WikidataBot
# Set to True by the first Ctrl-C; checked by the bot before each page.
willstop = False


def _signal_handler(signum, frame) -> None:
    """Handle SIGINT: request a graceful stop first, abort on the second hit.

    :param signum: signal number passed by the ``signal`` module (unused)
    :param frame: current stack frame passed by ``signal`` (unused)
    :raises KeyboardInterrupt: if a stop has already been requested
    """
    global willstop
    if willstop:
        # Second Ctrl-C: the user does not want to wait for the current item.
        raise KeyboardInterrupt
    willstop = True
    pywikibot.info('Received ctrl-c. Finishing current item; '
                   'press ctrl-c again to abort.')


signal.signal(signal.SIGINT, _signal_handler)
class PoemHarvestRobot(WikidataBot):

    """A bot to add Wikidata claims from the content of <poem> tags."""

    # First <poem> element, with or without attributes; group 1 is its body.
    _POEM_REGEX = re.compile(r'<poem(?:\s[^>]*)?>(.*?)</poem>', re.DOTALL)
    # An innermost template call, i.e. one containing no further braces.
    _TEMPLATE_REGEX = re.compile(r'{{[^{}]*}}')
    # A <pages .../> tag whose second attribute is one of the handled kinds.
    _TRANSCLUSION_REGEX = re.compile(
        r'<pages index="(?P<index>[^"]+)" '
        r'(?P<attr>include|onlysection|fromsection)="(?P<section>[^"]+)" />')
    # Upper bound used when joining further verses onto the first one.
    _MAX_LENGTH = 100

    def __init__(self, **kwargs) -> None:
        """Initializer; all keyword arguments are passed to WikidataBot."""
        super().__init__(**kwargs)

    def treat_page_and_item(self,
                            page: pywikibot.page.BasePage | None,
                            item: pywikibot.page.ItemPage | None) -> None:
        """Process a single page/item.

        Extracts the opening verses of the page and stores them on the item
        as a Czech ``P1922`` claim.

        :param page: the wiki page being processed, or None
        :param item: the Wikidata item connected to the page, or None
        :raises KeyboardInterrupt: if a stop was requested via Ctrl-C
        """
        if willstop:
            raise KeyboardInterrupt
        if page is None or item is None:
            # Nothing to read from, or nowhere to write the claim to.
            return
        assert page is self.current_page
        poem_content, transcluded_page_title = self.extract_poem_content(page)
        if not poem_content:
            return
        if transcluded_page_title:
            pywikibot.info('Obsah je transkludován')
            pywikibot.info(
                f'Transkludovaná stránka: {transcluded_page_title}')
        self.add_claim_to_item(item, 'P1922', poem_content, 'cs')

    def _first_poem(self, text: str) -> str | None:
        """Return the cleaned opening of the first <poem> in *text*.

        :return: the combined first verses, or None if there is no <poem>
        """
        match = self._POEM_REGEX.search(text)
        if match is None:
            return None
        content = self.clean_template_content(match.group(1).strip())
        return self.construct_poem_content(content)

    def extract_poem_content(self,
                             page: pywikibot.page.BasePage) -> tuple[str, str]:
        """Extract content of the first <poem> tag.

        If the page itself contains no <poem>, a ``<pages>`` transclusion is
        followed instead.

        :return: tuple of (poem opening, title of the transcluded page); the
            title is empty when the poem was found directly on *page*
        """
        poem = self._first_poem(textlib.removeDisabledParts(page.text))
        if poem is not None:
            return poem, ''
        return self.handle_transclusion(page)

    def clean_template_content(self, text: str) -> str:
        """Remove template markup and return clean content.

        Templates are removed from the innermost outwards so that nested
        calls such as ``{{a|{{b}}}}`` do not leave stray braces behind.
        """
        replaced = 1
        while replaced:
            text, replaced = self._TEMPLATE_REGEX.subn('', text)
        return text.strip()

    def construct_poem_content(self, content: str) -> str:
        """Construct the poem content from lines, limiting to 100 characters.

        The first line is always taken whole. Following lines are appended,
        separated by a space, until a blank line is met or the next line
        would push the result over the limit.
        """
        first, *rest = content.split('\n')
        combined = first.strip()
        for raw_line in rest:
            line = raw_line.strip()
            if not line:
                # A blank line ends the first stanza.
                break
            # +1 accounts for the joining space.
            if len(combined) + len(line) + 1 > self._MAX_LENGTH:
                break
            combined += ' ' + line
        return combined

    def handle_transclusion(self,
                            page: pywikibot.page.BasePage) -> tuple[str, str]:
        """Handle transcluded pages.

        Looks for a ``<pages index=... />`` tag on *page* and reads the poem
        from the page it points to.

        :return: tuple of (poem opening, title of the transcluded page), or
            two empty strings if no usable transclusion was found
        """
        match = self._TRANSCLUSION_REGEX.search(page.text)
        if match is None:
            return '', ''
        index = match.group('index')
        # Take the first part if it's a range
        section = match.group('section').split('-')[0]
        whole_page = match.group('attr') == 'include'
        # NOTE(review): for onlysection/fromsection the title carries no page
        # number, so it presumably rarely exists - confirm against real pages.
        page_title = (f'Stránka:{index}/{section}' if whole_page
                      else f'Stránka:{index}')
        # The transcluded page lives on the same wiki as the page itself.
        transcluded_page = pywikibot.Page(page.site, page_title)
        if not transcluded_page.exists():
            return '', ''
        if whole_page:
            # With include= the value is a page number, not a section name,
            # so the first poem of the whole page is wanted.
            text = textlib.removeDisabledParts(transcluded_page.text)
            poem_content = self._first_poem(text) or ''
        else:
            poem_content = self.extract_section_poem(transcluded_page,
                                                     section)
        return poem_content, page_title

    def extract_section_poem(self, page: pywikibot.page.BasePage,
                             section: str) -> str:
        """Extract the first <poem> after a specific section.

        The section may be delimited by ``<section begin=... />`` /
        ``<section end=... />`` tags (quoted or unquoted name) or by
        ``## name ##`` markers.

        :param page: page holding the labelled section
        :param section: name of the section to look into
        :return: the poem opening, or an empty string if the section or a
            <poem> inside it was not found
        """
        text = textlib.removeDisabledParts(page.text)
        name = re.escape(section)
        # An unquoted name must be followed by a delimiter, otherwise
        # section "5" would also match "50".
        label = rf'(?:"{name}"|{name}(?=[\s/>]))'
        marker = rf'##\s*{name}\s*##'
        # Both alternations are grouped so that the section body is always
        # captured, whichever delimiter style is used.
        section_regex = re.compile(
            rf'(?:<section\s+begin={label}[^>]*>|{marker})'
            r'(.*?)'
            rf'(?:<section\s+end={label}|{marker})',
            re.DOTALL)
        section_match = section_regex.search(text)
        if section_match is None:
            return ''
        return self._first_poem(section_match.group(1)) or ''

    def add_claim_to_item(self, item: pywikibot.page.ItemPage,
                          property_id: str, value: str,
                          language: str) -> None:
        """Add a monolingual text claim to the item.

        Nothing is added when an existing claim already holds the same text
        or a longer text starting with *value*. Existing claims whose text is
        a proper prefix of *value* are removed before the new claim is added.

        :param item: item to modify
        :param property_id: property of the claim, e.g. ``P1922``
        :param value: text of the claim
        :param language: language code of the claim
        """
        superseded = []
        # Scan every claim before deciding, so that an exact match listed
        # after a shorter one is not missed.
        for claim in item.claims.get(property_id, []):
            target = claim.getTarget()
            if not isinstance(target, WbMonolingualText):
                continue
            if target.text == value:
                pywikibot.info(
                    'Skipping: Claim with the same value already exists.')
                return
            if target.text.startswith(value):
                # Never replace a longer incipit with a shorter one.
                pywikibot.info('Skipping: A longer claim starting with the '
                               'same value already exists.')
                return
            if value.startswith(target.text):
                superseded.append(claim)

        if superseded:
            for claim in superseded:
                pywikibot.info(
                    f'Removing shorter claim: {claim.getTarget().text}')
            item.removeClaims(superseded)

        claim = pywikibot.Claim(self.repo, property_id)
        claim.setTarget(WbMonolingualText(text=value, language=language))
        item.addClaim(claim)
        pywikibot.info(f'Added claim {property_id} with value "{value}" '
                       f'in language "{language}"')
def main(*args: str) -> None:
    """Parse command line arguments and run the bot.

    Global arguments are consumed by ``pywikibot.handle_args``; the remaining
    ones are offered to the page generator factory. Arguments nobody
    recognizes are reported and otherwise ignored.

    :param args: command line arguments
    """
    gen = pg.GeneratorFactory()
    unknown = [arg for arg in pywikibot.handle_args(args)
               if not gen.handle_arg(arg)]
    if unknown:
        # Still ignored, but no longer silently: a mistyped option would
        # otherwise change which pages are processed without any hint.
        pywikibot.warning(
            'Ignoring unrecognized arguments: ' + ', '.join(unknown))

    if not gen.gens:
        pywikibot.error('No generator specified.')
        return

    generator = gen.getCombinedGenerator(preload=True)
    bot = PoemHarvestRobot(generator=generator)
    bot.run()


if __name__ == '__main__':
    main()