Uživatel:JAnDbot/poem2.py

Skript na import incipitu – prvních veršů básně – na Wikidata. Vytvořeno s pomocí ChatGPT na základě harvest_template.py. Zatím neúspěšný pokus o transkluzi; tato verze by mohla mít ošetřeny vložené šablony – netestováno.
import re
import signal
import sys

import pywikibot
from pywikibot import pagegenerators as pg
from pywikibot import textlib, WbMonolingualText
from pywikibot.exceptions import InvalidTitleError, NoPageError
from pywikibot.bot import WikidataBot

# Set to True by the first Ctrl-C so the run can stop between items.
willstop = False


def _signal_handler(signum, frame) -> None:
    """Handle SIGINT: request a graceful stop first, abort on repeat.

    The first interrupt only raises the module-level ``willstop`` flag and
    informs the user; any further interrupt raises KeyboardInterrupt.

    :param signum: signal number passed by the signal module (unused)
    :param frame: interrupted stack frame passed by the signal module
        (unused)
    :raises KeyboardInterrupt: when the flag was already set
    """
    global willstop
    if willstop:
        # Second Ctrl-C: the user does not want to wait any longer.
        raise KeyboardInterrupt
    willstop = True
    pywikibot.info('Received ctrl-c. Finishing current item; '
                   'press ctrl-c again to abort.')


# Install the handler for Ctrl-C at import time.
signal.signal(signal.SIGINT, _signal_handler)

class PoemHarvestRobot(WikidataBot):
    """A bot to add Wikidata claims from the content of <poem> tags.

    For each page the opening lines of the first ``<poem>`` block are
    collected (following a ``<pages ... />`` transclusion when the page
    itself contains no poem) and stored on the connected item as a Czech
    monolingual-text claim for property P1922.
    """

    # Default upper bound, in characters, for the joined opening lines.
    MAX_INCIPIT_LENGTH = 100

    # The opening tag may carry attributes (e.g. style="...") and tag names
    # are matched case-insensitively.
    POEM_REGEX = re.compile(r'<poem\b[^>]*>(.*?)</poem\s*>',
                            re.DOTALL | re.IGNORECASE)

    # Matches an innermost template only (no "{{" or "}}" inside), so nested
    # templates can be peeled off from the inside out.
    TEMPLATE_REGEX = re.compile(r'\{\{(?:(?!\{\{|\}\}).)*\}\}', re.DOTALL)

    # <pages index="..." include|onlysection|fromsection="..." /> in exactly
    # this attribute order; other attribute layouts are not recognised.
    TRANSCLUSION_REGEX = re.compile(
        r'<pages index="(?P<index>[^"]+)" '
        r'(?P<mode>include|onlysection|fromsection)="(?P<section>[^"]+)" />')

    def __init__(self, **kwargs) -> None:
        """Initializer; all keyword arguments are passed to WikidataBot."""
        super().__init__(**kwargs)

    def treat_page_and_item(self,
                            page: pywikibot.page.BasePage | None,
                            item: pywikibot.page.ItemPage | None) -> None:
        """Process a single page/item.

        :param page: page whose poem is harvested; nothing is done for None
        :param item: item receiving the claim; nothing is done for None
        :raises KeyboardInterrupt: when the module-level ``willstop`` flag
            has been set by an earlier Ctrl-C
        """
        if willstop:
            raise KeyboardInterrupt

        # Without both a page and an item there is nothing to read from or
        # write to.
        if page is None or item is None:
            return

        assert page is self.current_page

        poem_content, transcluded_page_title = self.extract_poem_content(page)
        if not poem_content:
            return

        if transcluded_page_title:
            pywikibot.info('Obsah je transkludován')
            pywikibot.info(
                f'Transkludovaná stránka: {transcluded_page_title}')
        self.add_claim_to_item(item, 'P1922', poem_content, 'cs')

    def extract_poem_content(self, page: pywikibot.page.BasePage) -> tuple[str, str]:
        """Extract content of the first <poem> tag.

        The page's own text is searched first; only when it contains no
        ``<poem>`` tag at all is a ``<pages />`` transclusion followed.

        :param page: page to read
        :return: ``(incipit, transcluded_page_title)``; the title is empty
            when the poem was found on *page* itself, and both values are
            empty when nothing was found
        """
        text = textlib.removeDisabledParts(page.text)
        incipit = self._first_poem_incipit(text)
        if incipit is not None:
            return incipit, ''
        return self.handle_transclusion(page)

    def _first_poem_incipit(self, text: str) -> str | None:
        """Return the incipit of the first <poem> in *text*, or None.

        None means that *text* has no ``<poem>`` block; an empty string
        means a block exists but yields no text after cleaning.
        """
        match = self.POEM_REGEX.search(text)
        if match is None:
            return None
        content = self.clean_template_content(match.group(1).strip())
        return self.construct_poem_content(content)

    def clean_template_content(self, text: str) -> str:
        """Remove template markup and return clean content.

        Templates are deleted together with their parameters. Innermost
        templates are removed repeatedly until the text stops changing, so
        nested templates such as ``{{a|{{b}}}}`` leave no stray braces.

        :param text: wikitext to clean
        :return: *text* without templates, stripped of outer whitespace
        """
        previous = None
        while previous != text:
            previous = text
            text = self.TEMPLATE_REGEX.sub('', text)
        return text.strip()

    def construct_poem_content(self, content: str,
                               max_length: int = MAX_INCIPIT_LENGTH) -> str:
        """Construct the poem content from lines, limiting its length.

        The first line is always taken in full, even when it alone exceeds
        *max_length*. Following lines are appended, separated by a single
        space, until a blank line is met or the next line would push the
        result over *max_length*.

        :param content: cleaned poem text
        :param max_length: maximum length of the result when more than one
            line is joined (default 100 characters)
        :return: the joined opening lines
        """
        lines = [line.strip() for line in content.split('\n')]
        combined_content = lines[0]
        for line in lines[1:]:
            # A blank line ends the first stanza; +1 accounts for the space.
            if not line or len(combined_content) + len(line) + 1 > max_length:
                break
            combined_content += ' ' + line
        return combined_content

    def handle_transclusion(self, page: pywikibot.page.BasePage) -> tuple[str, str]:
        """Handle transcluded pages.

        Looks for the first ``<pages />`` tag recognised by
        ``TRANSCLUSION_REGEX`` and reads the poem from the referenced page.

        :param page: page that may transclude its content
        :return: ``(incipit, transcluded_page_title)``; both are empty when
            no tag is found or the referenced page does not exist
        """
        match = self.TRANSCLUSION_REGEX.search(page.text)
        if match is None:
            return '', ''

        index = match.group('index')
        value = match.group('section')
        include_mode = match.group('mode') == 'include'
        if include_mode:
            # ``include`` holds page numbers ("5", "5-7", "5,9"): only the
            # first listed page is consulted.
            first_page = re.split(r'[-,]', value, maxsplit=1)[0].strip()
            page_title = f'Stránka:{index}/{first_page}'
        else:
            # NOTE(review): the tag's ``from`` attribute is not parsed, so
            # no page number is known here and the bare index title is
            # used - confirm that such a page can exist.
            page_title = f'Stránka:{index}'

        # The transcluded page lives on the same site as the page itself.
        transcluded_page = pywikibot.Page(page.site, page_title)
        try:
            if not transcluded_page.exists():
                return '', ''
        except pywikibot.exceptions.InvalidTitleError:
            # The index name taken from the tag does not form a valid title.
            return '', ''

        if include_mode:
            # A page number is not a section name: take the first poem
            # anywhere on that page.
            text = textlib.removeDisabledParts(transcluded_page.text)
            poem_content = self._first_poem_incipit(text) or ''
        else:
            poem_content = self.extract_section_poem(transcluded_page, value)
        return poem_content, page_title

    def extract_section_poem(self, page: pywikibot.page.BasePage, section: str) -> str:
        """Extract the first <poem> inside a specific section.

        A section is delimited by ``<section begin=NAME />`` and
        ``<section end=NAME />`` tags (name quoted or not) or by
        ``## NAME ##`` markers.

        :param page: page containing the section
        :param section: section name, matched literally
        :return: incipit of the first poem in the section, or an empty
            string when the section or a poem in it is not found
        """
        text = textlib.removeDisabledParts(page.text)
        name = re.escape(section)
        # Accept the name in double quotes, single quotes or bare, but only
        # as a whole value so that "a" does not match section "ab".
        label = rf'''(?:"{name}"|'{name}'|{name})'''
        marker = rf'##\s*{name}\s*##'
        # Both alternations are grouped so that an end marker can never
        # match without a preceding begin marker.
        section_regex = re.compile(
            rf'(?:<section\s+begin\s*=\s*{label}\s*/?>|{marker})'
            rf'(.*?)'
            rf'(?:<section\s+end\s*=\s*{label}\s*/?>|{marker})',
            re.DOTALL)
        section_match = section_regex.search(text)
        if section_match is None:
            return ''
        return self._first_poem_incipit(section_match.group(1)) or ''

    def add_claim_to_item(self, item: pywikibot.page.ItemPage, property_id: str, value: str, language: str) -> None:
        """Add a monolingual text claim to the item.

        Existing monolingual-text claims of *property_id* are compared by
        text only (their language is ignored):

        * an equal text, or a longer text that starts with *value*, means
          nothing is added;
        * texts that are a proper prefix of *value* are removed before the
          new, longer claim is added.

        :param item: item to modify; nothing is done for None
        :param property_id: property of the claim, e.g. ``'P1922'``
        :param value: claim text; nothing is done for an empty string
        :param language: language code of the claim text
        """
        if item is None or not value:
            return

        superseded = []
        for claim in item.claims.get(property_id, []):
            target = claim.getTarget()
            if not isinstance(target, WbMonolingualText):
                continue
            if target.text == value:
                pywikibot.info(
                    'Skipping: Claim with the same value already exists.')
                return
            if target.text.startswith(value):
                # The stored text already covers the new one; replacing it
                # would lose information.
                pywikibot.info('Skipping: A longer claim starting with the '
                               'same text already exists.')
                return
            if target.text and value.startswith(target.text):
                superseded.append(claim)

        if superseded:
            for claim in superseded:
                pywikibot.info(
                    f'Removing shorter claim: {claim.getTarget().text}')
            item.removeClaims(superseded)

        claim = pywikibot.Claim(self.repo, property_id)
        claim.setTarget(WbMonolingualText(text=value, language=language))
        item.addClaim(claim)
        pywikibot.info(f'Added claim {property_id} with value "{value}" in language "{language}"')

def main(*args: str) -> None:
    """Parse command-line arguments and run the poem harvesting bot.

    Global pywikibot options are consumed first; the remaining arguments
    are offered to the page generator factory. Arguments the factory does
    not accept are reported with a warning and otherwise ignored.

    :param args: command-line arguments; passed on to
        ``pywikibot.handle_args``
    """
    gen = pg.GeneratorFactory()

    local_args = pywikibot.handle_args(args)
    # handle_arg() gives a false result for arguments it did not consume;
    # report them so that typos do not go unnoticed.
    unhandled = [arg for arg in local_args if not gen.handle_arg(arg)]
    if unhandled:
        pywikibot.warning(
            'Ignoring unrecognised arguments: ' + ', '.join(unhandled))

    if not gen.gens:
        pywikibot.error('No generator specified.')
        return

    generator = gen.getCombinedGenerator(preload=True)

    bot = PoemHarvestRobot(generator=generator)
    bot.run()

if __name__ == '__main__':
    main()