Uživatel:JAnDbot/poem.py

Skript na import incipitu – prvních veršů básně – na Wikidata. Vytvořeno s pomocí ChatGPT na základě harvest_template.py. Neumí transkluzi; tato verze nemá ošetřeny vložené šablony. Viz též poem2.py.
#!/usr/bin/env python3
#
# (C) JAnD, ChatGPT
#
# Distributed under the terms of MIT license.
#
import re
import signal
import sys

import pywikibot
from pywikibot import pagegenerators as pg
from pywikibot import textlib, WbMonolingualText
from pywikibot.exceptions import InvalidTitleError, NoPageError
from pywikibot.bot import WikidataBot

# Set to True by the SIGINT handler on the first ctrl-c. The bot checks it
# before treating each page and stops once it is set.
willstop = False

def _signal_handler(signum, frame) -> None:
    """React to SIGINT in two stages.

    The first ctrl-c only raises the module-level ``willstop`` flag so
    that the item currently being processed can be finished.  Any further
    ctrl-c aborts immediately with :exc:`KeyboardInterrupt`.

    :param signum: number of the received signal (unused)
    :param frame: interrupted stack frame (unused)
    :raises KeyboardInterrupt: when a stop was already requested
    """
    global willstop
    # A stop is already pending: the user insists, so abort right now.
    if willstop:
        raise KeyboardInterrupt

    willstop = True
    pywikibot.info('Received ctrl-c. Finishing current item; '
                   'press ctrl-c again to abort.')

# Route SIGINT (ctrl-c) through the two-stage handler defined above.
signal.signal(signal.SIGINT, _signal_handler)

class PoemHarvestRobot(WikidataBot):
    """Bot that imports a poem's incipit (opening verses) into Wikidata.

    For every page delivered by the generator the bot takes the first
    ``<poem>`` block of the wikitext, reduces it to its opening lines and
    stores that text as a Czech monolingual ``P1922`` ("first line") claim
    on the Wikidata item connected to the page.
    """

    # First ``<poem>...</poem>`` block; DOTALL lets the body span lines.
    # NOTE: only the bare ``<poem>`` tag is recognised, tags carrying
    # attributes (``<poem style=...>``) are not matched.
    _POEM_RE = re.compile(r'<poem>(.*?)</poem>', re.DOTALL)

    # ``[[Target]]`` -> ``Target`` and ``[[Target|Label]]`` -> ``Label``.
    _LINK_RE = re.compile(r'\[\[(?:[^|\]\n]*\|)?(.*?)\]\]')

    # Opening of the {{Forma}} template (first letter in either case).
    _FORMA_RE = re.compile(r'\{\{[Ff]orma\b')

    def __init__(self, **kwargs) -> None:
        """Initializer; all keyword arguments go to :class:`WikidataBot`."""
        super().__init__(**kwargs)

    def setup(self) -> None:
        """Load the source ("imported from") data used for references."""
        self.cacheSources()

    def treat_page_and_item(self,
                            page: pywikibot.page.BasePage | None,
                            item: pywikibot.page.ItemPage | None) -> None:
        """Harvest the incipit of a single page and store it on its item.

        :param page: the wiki page whose text is searched for ``<poem>``
        :param item: the Wikidata item connected to *page*
        :raises KeyboardInterrupt: if a stop was requested via ctrl-c
        """
        if willstop:
            raise KeyboardInterrupt

        if page is None:
            return

        assert page is self.current_page
        self.site = page.site

        if item is None:
            # The signature allows a missing item; nothing to write to.
            return

        text = page.text
        poem_start = text.find('<poem>')
        if poem_start < 0:
            # No poem on the page, nothing to harvest.
            return

        # Skip the page only when {{Forma}} really precedes the first
        # <poem> tag; a {{Forma}} further down does not affect this poem.
        if self._FORMA_RE.search(text[:poem_start]):
            pywikibot.info(
                f"Skipping page {page.title()} because it contains "
                "'{{Forma}}' or '{{forma}}' before <poem>")
            return

        poem_content = self.extract_poem_content(text)
        if poem_content:
            self.add_claim_to_item(item, 'P1922', poem_content, 'cs')

    def extract_poem_content(self, text: str) -> str:
        """Return the opening lines of the first ``<poem>`` block.

        The poem body is cleaned of wiki links, apostrophes and the
        formatting templates known to :meth:`remove_templates`.  Lines are
        then joined with single spaces, starting with the first one, until
        a blank line (end of stanza), a line ending in ``.``, ``!`` or
        ``?`` (end of sentence) or the end of the poem is reached.  The
        first line itself is always followed by at least the next
        non-empty line, even if it already ends a sentence.

        :param text: wikitext of the page
        :return: the incipit, or ``''`` when there is no ``<poem>`` block
            or the block is empty after cleaning
        """
        match = self._POEM_RE.search(text)
        if not match:
            return ''

        content = match.group(1)
        # Keep only the visible text of links: for a piped link that is
        # the label, not "Target|Label".
        content = self._LINK_RE.sub(r'\1', content)
        # Drops the ''italic'' / '''bold''' markup.  Note that this also
        # removes genuine apostrophes occurring in the verse.
        content = content.replace("'", "")
        # Strip only after the templates are gone: a removed template at
        # the very beginning would otherwise leave leading whitespace in
        # the resulting value.
        content = self.remove_templates(content).strip()
        if not content:
            return ''

        lines = content.split('\n')
        result = lines[0].strip()

        for line in lines[1:]:
            line = line.strip()
            if not line:
                # Blank line: end of the first stanza.
                return result
            result += ' ' + line
            if line.endswith(('.', '!', '?')):
                # End of the first sentence.
                return result

        # The poem ended before any stop condition was met.
        return result

    def remove_templates(self, text: str) -> str:
        """Remove selected formatting templates from *text*.

        Every ``{{name...}}`` call whose name starts (case-insensitively)
        with one of the listed names is deleted together with its
        parameters, i.e. the text wrapped by the template is dropped as
        well.  Nested templates are not handled.

        :param text: wikitext fragment
        :return: *text* without the matched template calls
        """
        templates = ['Prostrkaně', 'uprostřed', 'vlevo', 'vpravo', 'verzálky']
        for template in templates:
            text = re.sub(r'\{\{' + re.escape(template) + r'[^}]*\}\}', '', text, flags=re.IGNORECASE)
        return text

    def add_claim_to_item(self, item: pywikibot.page.ItemPage, property_id: str, value: str, language: str) -> None:
        """Add *value* as a monolingual text claim, or extend a matching one.

        An existing claim counts as the same incipit when its text starts
        with the first 50 characters of *value*.  Such a claim is replaced
        by *value* only if *value* is longer, and it receives a source
        reference if it has none.  Without a matching claim a new, sourced
        claim is created.

        :param item: Wikidata item to modify
        :param property_id: property of the claim, e.g. ``'P1922'``
        :param value: text of the incipit
        :param language: language code stored with the text
        """
        prefix = value[:50]
        existing_claim = None
        for claim in item.claims.get(property_id, []):
            target = claim.getTarget()
            # Claims without a value have no target and cannot match.
            if target and target.text.startswith(prefix):
                existing_claim = claim
                break

        if existing_claim is not None:
            if len(existing_claim.getTarget().text) < len(value):
                target = WbMonolingualText(text=value, language=language)
                existing_claim.changeTarget(target)
                pywikibot.info(f'Updated existing claim {property_id} with new value "{value}"')
            if not existing_claim.sources:
                source_claim = self.getSource(self.site)
                # getSource() yields nothing for sites it has no source
                # item for; leave the claim unsourced in that case.
                if source_claim:
                    existing_claim.addSource(source_claim)
                    pywikibot.info(f'Added source to existing claim {property_id} with value "{value}"')
            return

        claim = pywikibot.Claim(self.repo, property_id)
        target = WbMonolingualText(text=value, language=language)
        claim.setTarget(target)

        source_claim = self.getSource(self.site)
        if source_claim:
            claim.addSource(source_claim)

        item.addClaim(claim)
        pywikibot.info(f'Added claim {property_id} with value "{value}" in language "{language}"')


def main(*args: str) -> None:
    """Parse the command line and run :class:`PoemHarvestRobot`.

    Global pywikibot options are consumed by ``pywikibot.handle_args``;
    the remaining arguments are offered to the page generator factory.
    Arguments the factory does not recognise are skipped with a warning.
    The bot is started only if at least one page generator was given.

    :param args: command-line arguments
    """
    gen = pg.GeneratorFactory()

    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if not gen.handle_arg(arg):
            # Still ignored, but no longer silently: a mistyped option
            # would otherwise go unnoticed.
            pywikibot.warning(f'Ignoring unknown argument "{arg}"')

    if not gen.gens:
        pywikibot.error('No generator specified.')
        return

    generator = gen.getCombinedGenerator(preload=True)

    bot = PoemHarvestRobot(generator=generator)
    bot.run()


# Run the bot only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()