Skript na import incipitu – prvních veršů básně – na Wikidata. Vytvořeno s pomocí ChatGPT na základě harvest_template.py. Neumí transkluzi; tato verze nemá ošetřeny vložené šablony. Viz též poem2.py.

#!/usr/bin/env python3
# (C) JAnD, ChatGPT
# Distributed under the terms of MIT license.
import re
import signal
import sys

import pywikibot
from pywikibot import pagegenerators as pg
from pywikibot import textlib, WbMonolingualText
from pywikibot.exceptions import InvalidTitleError, NoPageError
from pywikibot.bot import WikidataBot

# Set by the first ctrl-c; checked at the start of every treated page so the
# bot stops between items instead of in the middle of an edit.
willstop = False


def _signal_handler(signum, frame) -> None:
    """Handle SIGINT: ask for a graceful stop first, abort on repeat.

    The first ctrl-c only sets the module-level ``willstop`` flag and
    tells the user that the current item will be finished. Any further
    ctrl-c raises :exc:`KeyboardInterrupt` to abort immediately.

    :param signum: number of the received signal (unused)
    :param frame: stack frame active when the signal arrived (unused)
    :raises KeyboardInterrupt: when a stop was already requested
    """
    global willstop
    if not willstop:
        willstop = True
        pywikibot.info('Received ctrl-c. Finishing current item; '
                       'press ctrl-c again to abort.')
    else:
        # Second ctrl-c: the user does not want to wait any longer.
        raise KeyboardInterrupt


signal.signal(signal.SIGINT, _signal_handler)

class PoemHarvestRobot(WikidataBot):
    """A bot to add Wikidata claims from the content of <poem> tags."""

    def __init__(self, **kwargs) -> None:

    def setup(self) -> None:

    def treat_page_and_item(self,
                            page: pywikibot.page.BasePage | None,
                            item: pywikibot.page.ItemPage | None) -> None:
        """Harvest the incipit of one page and store it on its item.

        Pages on which a ``{{Forma}}`` template appears before the first
        ``<poem>`` tag are skipped. Otherwise the opening verses found by
        :meth:`extract_poem_content` are passed to
        :meth:`add_claim_to_item` as a Czech (``cs``) ``P1922`` value.

        :param page: the page being processed; nothing is done for None
        :param item: the Wikidata item connected to *page*
        :raises KeyboardInterrupt: if a stop was requested via ctrl-c
        """
        if willstop:
            raise KeyboardInterrupt

        if page is None:
            return

        assert page is self.current_page
        self.site = page.site

        text = page.text

        # Skip the page only when {{Forma}} really precedes the first <poem>;
        # str.find returns -1 for a missing tag, hence the explicit check.
        forma = re.search(r'\{\{[Ff]orma\b', text)
        poem_start = text.find('<poem>')
        if forma and poem_start != -1 and forma.start() < poem_start:
            # Braces are doubled twice so the message shows literal {{Forma}}.
            pywikibot.info(f"Skipping page {page.title()} because it contains '{{{{Forma}}}}' or '{{{{forma}}}}' before <poem>")
            return

        # Process the page content to find the first <poem> tag
        poem_content = self.extract_poem_content(text)
        if poem_content:
            self.add_claim_to_item(item, 'P1922', poem_content, 'cs')

    def extract_poem_content(self, text: str) -> str:
        """Extract content of the first <poem> tag."""
        poem_regex = re.compile(r'<poem>(.*?)</poem>', re.DOTALL)
        match = poem_regex.search(text)
        if match:
            content = match.group(1).strip()
            # Remove wikimarkup ([[links]]) and single quotes
            # TODO: je rozdíl mezi [[Odkaz]] a [[Odkaz|Popis]]
            content = re.sub(r'\[\[(.*?)\]\]', r'\1', content)
            content = content.replace("'", "")
            content = self.remove_templates(content)

            # Split content into lines
            lines = content.split('\n')
            result = lines[0].strip()

            # Process each line
            for line in lines[1:]:
                line = line.strip()
                if not line:
                    # End of paragraph, return the result
                    return result
                result += ' ' + line
                if line.endswith(('.', '!', '?')):
                    # End of sentence, return the result
                    return result

            # End of poem, return the result
            return result

        return ''

    def remove_templates(self, text: str) -> str:
        """Strip a fixed set of formatting templates out of *text*.

        For each listed name in turn, every ``{{name...}}`` call is deleted
        together with its arguments. The name is matched
        case-insensitively and the match ends at the first ``}``, so
        nested templates are not handled.

        :param text: wikitext to clean
        :return: *text* with the matching template calls removed
        """
        names = ('Prostrkaně', 'uprostřed', 'vlevo', 'vpravo', 'verzálky')
        cleaned = text
        # The names are applied one after another on purpose: removing one
        # template can expose a match for a later one.
        for name in names:
            pattern = re.compile(r'\{\{' + re.escape(name) + r'[^}]*\}\}',
                                 re.IGNORECASE)
            cleaned = pattern.sub('', cleaned)
        return cleaned

    def add_claim_to_item(self, item: pywikibot.page.ItemPage, property_id: str, value: str, language: str) -> None:
        """Add or extend a monolingual text claim on *item*.

        A claim of *property_id* whose text starts with the first 50
        characters of *value* is treated as the same incipit. If such a
        claim exists, its text is replaced when *value* is longer, and a
        source is attached when the claim has none. Otherwise a new
        sourced claim is added to the item.

        :param item: the Wikidata item to edit
        :param property_id: ID of the monolingual text property
        :param value: text to store
        :param language: language code of *value*
        """
        if not value:
            # An empty prefix would match every existing claim.
            return

        prefix = value[:50]
        existing_claim = None
        for claim in item.claims.get(property_id, []):
            target = claim.getTarget()
            if target and target.text.startswith(prefix):
                existing_claim = claim

        new_target = WbMonolingualText(text=value, language=language)

        if existing_claim:
            if len(existing_claim.getTarget().text) < len(value):
                existing_claim.changeTarget(new_target)
                pywikibot.info(f'Updated existing claim {property_id} with new value "{value}"')
            if not existing_claim.sources:
                source_claim = self.getSource(self.site)
                # getSource may return None when the site has no known item.
                if source_claim:
                    existing_claim.addSource(source_claim)
                    pywikibot.info(f'Added source to existing claim {property_id} with value "{value}"')
            return

        # No claim shares the prefix, so there is nothing shorter to replace;
        # create a fresh claim instead.
        claim = pywikibot.Claim(self.repo, property_id)
        claim.setTarget(new_target)
        item.addClaim(claim)

        # The source is attached after the claim has been saved on the item.
        source_claim = self.getSource(self.site)
        if source_claim:
            claim.addSource(source_claim)

        pywikibot.info(f'Added claim {property_id} with value "{value}" in language "{language}"')

def main(*args: str) -> None:
    """Parse the command line arguments and run the bot.

    Global pywikibot options are consumed first; the remaining arguments
    are offered to the page generator factory. Without any page generator
    an error is reported and nothing is processed.

    :param args: command line arguments
    """
    gen = pg.GeneratorFactory()

    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        # Arguments the generator factory does not recognise are
        # deliberately ignored.
        gen.handle_arg(arg)

    if not gen.gens:
        pywikibot.error('No generator specified.')
        return

    generator = gen.getCombinedGenerator(preload=True)

    bot = PoemHarvestRobot(generator=generator)
    bot.run()

if __name__ == '__main__':