| #!/usr/bin/env python3 |
| """Fetch and add titles for bare links in references. |
| |
This bot searches for references that consist only of a bare link
without a title (e.g. ``<ref>[https://www.google.fr/]</ref>`` or
``<ref>https://www.google.fr/</ref>``), fetches the HTML title from
the link and uses it as the title of the wiki link in the reference,
e.g.
| |
| .. code:: wikitext |
| |
| <ref>[https://www.google.fr/search?q=test test - Google Search]</ref> |
| |
Every 20 edits the bot checks a special stop page; if that page has
been edited, the bot stops.
| |
Because this script uses noreferences.py, you need to configure
noreferences.py for your wiki, or it will not work.
| |
pdfinfo is needed for parsing PDF titles.
| |
| The following parameters are supported: |
| |
-xml:dump.xml     Use an XML dump instead of fetching each page from the
                  wiki, which is preferable for performance and server
                  load reasons

-xmlstart         Page to start with when using an XML dump

This script is a :class:`ConfigParserBot <bot.ConfigParserBot>`. The
following options can be set within a settings file which is scripts.ini
by default:

-always           Do not ask for confirmation before each change; apply
                  all changes automatically

-limit:n          [int] Stop after n edits

-ignorepdf        Do not handle PDF files (handy if you use Windows and
                  can't get pdfinfo)

-summary          [str] Use a custom edit summary. Otherwise it uses the
                  default one from translatewiki
| |
| The following generators and filters are supported: |
| |
&params;
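
A typical invocation with the standard ``pwb.py`` wrapper, combining the
options above with the usual global site options (adjust the site and
the limit to your needs), could look like:

.. code:: shell

    python pwb.py reflinks -lang:en -family:wikipedia -limit:10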
| """ |
| # (C) Pywikibot team, 2008-2024 |
| # |
| # Distributed under the terms of the MIT license. |
| # |
| from __future__ import annotations |
| |
| import http.client as httplib |
| import itertools |
| import os |
| import re |
| import subprocess |
| import tempfile |
| from contextlib import suppress |
| from enum import IntEnum |
| from functools import partial |
| from http import HTTPStatus |
| from pathlib import Path |
| from textwrap import shorten |
| |
| import pywikibot |
| from pywikibot import comms, config, i18n, pagegenerators, textlib |
| from pywikibot.backports import removeprefix |
| from pywikibot.bot import ConfigParserBot, ExistingPageBot, SingleSiteBot |
| from pywikibot.comms.http import get_charset_from_content_type |
| from pywikibot.exceptions import ServerError |
| from pywikibot.pagegenerators import ( |
| XMLDumpPageGenerator as _XMLDumpPageGenerator, |
| ) |
| from pywikibot.textlib import replaceExcept |
| from pywikibot.tools.chars import string2html |
| |
| |
| try: |
| from scripts import noreferences |
| except ModuleNotFoundError: |
| from pywikibot_scripts import noreferences |
| |
| |
| docuReplacements = { |
    '&params;': pagegenerators.parameterHelp
| } |
| |
| localized_msg = ('fr', 'it', 'pl') # localized message at MediaWiki |
| |
| # localized message at specific Wikipedia site |
| # should be moved to MediaWiki Pywikibot manual |
| |
| |
| stop_page = { |
| 'fr': 'Utilisateur:DumZiBoT/EditezCettePagePourMeStopper', |
| 'da': 'Bruger:DumZiBoT/EditThisPageToStopMe', |
| 'de': 'Benutzer:DumZiBoT/EditThisPageToStopMe', |
| 'fa': 'کاربر:Amirobot/EditThisPageToStopMe', |
| 'it': 'Utente:Marco27Bot/EditThisPageToStopMe', |
| 'ko': '사용자:GrassnBreadRefBot/EditThisPageToStopMe1', |
| 'he': 'User:Matanyabot/EditThisPageToStopMe', |
| 'hu': 'User:Damibot/EditThisPageToStopMe', |
| 'en': 'User:DumZiBoT/EditThisPageToStopMe', |
| 'pl': 'Wikipedysta:MastiBot/EditThisPageToStopMe', |
| 'ru': 'User:Rubinbot/EditThisPageToStopMe', |
| 'ur': 'صارف:Shuaib-bot/EditThisPageToStopMe', |
| 'zh': 'User:Sz-iwbot', |
| } |
| |
| deadLinkTag = { |
| 'ar': '[%s] {{وصلة مكسورة}}', |
| 'fr': '[%s] {{lien mort}}', |
| 'da': '[%s] {{dødt link}}', |
| 'fa': '[%s] {{پیوند مرده}}', |
| 'he': '{{קישור שבור}}', |
| 'hi': '[%s] {{Dead link}}', |
| 'hu': '[%s] {{halott link}}', |
| 'ko': '[%s] {{죽은 바깥 고리}}', |
| 'es': '{{enlace roto2|%s}}', |
| 'it': '{{Collegamento interrotto|%s}}', |
| 'en': '[%s] {{dead link}}', |
| 'pl': '[%s] {{Martwy link}}', |
| 'ru': '[%s] {{Недоступная ссылка}}', |
| 'sr': '[%s] {{dead link}}', |
| 'ur': '[%s] {{مردہ ربط}}', |
| } |
| |
| |
| soft404 = re.compile( |
| r'\D404(\D|\Z)|error|errdoc|Not.{0,3}Found|sitedown|eventlog', |
| re.IGNORECASE) |
# matches a URL at the root or index page of a website
| dirIndex = re.compile( |
| r'\w+://[^/]+/((default|index)\.' |
| r'(asp|aspx|cgi|htm|html|phtml|mpx|mspx|php|shtml|var))?', |
| re.IGNORECASE) |
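# e.g. 'http://www.example.org/' and 'http://www.example.org/index.html'
# are both full matches for dirIndex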
| # Extracts the domain name |
| domain = re.compile(r'^(\w+)://(?:www\.|)([^/]+)') |
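# e.g. domain.findall('https://www.example.org/path') returns
# [('https', 'example.org')], i.e. the scheme and the host with any
# leading 'www.' stripped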
| |
| globalbadtitles = r""" |
| # is |
| (test| |
| # starts with |
| ^\W*( |
| register |
| |registration |
| |(sign|log)[ \-]?in |
| |subscribe |
| |sign[ \-]?up |
| |log[ \-]?on |
| |untitled[ ]?(document|page|\d+|$) |
| |404[ ] |
| ).* |
| # anywhere |
| |.*( |
| 403[ ]forbidden |
| |(404|page|file|information|resource).*not([ ]*be)?[ ]* |
| (available|found) |
| |are[ ](?:.+?[ ])?robot |
| |site.*disabled |
| |error[ ]404 |
| |error.+not[ ]found |
| |not[ ]found.+error |
| |404[ ]error |
| |\D404\D |
| |check[ ]browser[ ]settings |
| |log[ \-]?(on|in)[ ]to |
| |site[ ]redirection |
| ).* |
| # ends with |
| |.*( |
| register |
| |registration |
| |(sign|log)[ \-]?in |
| |subscribe|sign[ \-]?up |
| |log[ \-]?on |
| )\W*$ |
| ) |
| """ |
| # Language-specific bad titles |
| badtitles = { |
| 'en': '', |
| 'fr': '.*(404|page|site).*en +travaux.*', |
| 'es': '.*sitio.*no +disponible.*', |
| 'it': '((pagina|sito) (non trovat[ao]|inesistente)|accedi|errore)', |
| 'ru': '.*([Сс]траница.*(не[ ]*найдена|отсутствует)|Вы.*человек).*', |
| } |
| |
# Regex that matches bare references
| linksInRef = re.compile( |
| # bracketed URLs |
| r'(?i)<ref(?P<name>[^>]*)>\s*\[?(?P<url>(?:http|https)://(?:' |
| # unbracketed with() |
    r'[^\[\]\s<>"]+\([^\[\]\s<>"]+[^\[\]\s\.:;\\,<>\?"]+|'
| # unbracketed without () |
| r'[^\[\]\s<>"]+[^\[\]\s\)\.:;\\,<>\?"]+|[^\[\]\s<>"]+))' |
| r'[!?,\s]*\]?\s*</ref>') |
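# Illustrative matches for linksInRef:
#   <ref>http://www.example.org/page</ref>
#   <ref name="foo">[https://www.example.org/page]</ref>
# The 'url' group captures the bare URL, excluding the optional
# surrounding brackets and trailing punctuation.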
| |
# Download this file:
# http://www.twoevils.org/files/wikipedia/404-links.txt.gz
# (maintained by User:Dispenser)
| listof404pages = '404-links.txt' |
| |
| XmlDumpPageGenerator = partial( |
| _XMLDumpPageGenerator, text_predicate=linksInRef.search) |
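# This generator only yields pages from the dump whose raw text
# contains at least one bare reference matching linksInRef.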
| |
| |
| class RefLink: |
| |
| """Container to handle a single bare reference.""" |
| |
| def __init__(self, link, name, site=None) -> None: |
| """Initializer.""" |
| self.name = name |
| self.link = link |
| self.site = site or pywikibot.Site() |
| self.comment = i18n.twtranslate(self.site, 'reflinks-comment') |
| self.url = re.sub('#.*', '', self.link) |
| self.title = None |
| |
| def refTitle(self) -> str: |
| """Return the <ref> with its new title.""" |
| return (f'<ref{self.name}>[{self.link} {self.title}' |
| f'<!-- {self.comment} -->]</ref>') |
| |
| def refLink(self) -> str: |
| """No title has been found, return the unbracketed link.""" |
| return f'<ref{self.name}>{self.link}</ref>' |
| |
| def refDead(self): |
| """Dead link, tag it with a {{dead link}}.""" |
| tag = i18n.translate(self.site, deadLinkTag) |
| if not tag: |
| dead_link = self.refLink() |
| else: |
| if '%s' in tag: |
| tag %= self.link |
| dead_link = f'<ref{self.name}>{tag}</ref>' |
| return dead_link |
| |
| def transform(self, ispdf: bool = False) -> None: |
| """Normalize the title.""" |
| # convert html entities |
| if not ispdf: |
| self.title = pywikibot.html2unicode(self.title) |
| self.title = re.sub(r'-+', '-', self.title) |
        # remove formatting, i.e. long useless strings
| self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title) |
| # remove \n and \r and unicode spaces from titles |
| self.title = re.sub(r'\s', ' ', self.title) |
| # remove extra whitespaces |
| # remove leading and trailing ./;/,/-/_/+/ / |
| self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ ')) |
| |
| self.avoid_uppercase() |
        # avoid closing the link before the end
        self.title = self.title.replace(']', '&#93;')
        # avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace('}}', '}&#125;')
        # prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace("''", "'&#39;")
| self.title = string2html(self.title, self.site.encoding()) |
| # TODO : remove HTML when both opening and closing tags are included |
| |
| def avoid_uppercase(self) -> None: |
| """Convert to title()-case if title is 70% uppercase characters. |
| |
| Skip title that has less than 6 characters. |
| """ |
| if len(self.title) <= 6: |
| return |
| nb_upper = 0 |
| nb_letter = 0 |
| for letter in self.title: |
| if letter.isupper(): |
| nb_upper += 1 |
| if letter.isalpha(): |
| nb_letter += 1 |
| if letter.isdigit(): |
| return |
| if nb_upper / (nb_letter + 1) > 0.7: |
| self.title = self.title.title() |
| |
| |
| class IX(IntEnum): |
| |
| """Index class for references data.""" |
| |
| name = 0 |
| reflist = 1 |
| quoted = 2 |
| change_needed = 3 |
| |
| |
| class DuplicateReferences: |
| |
| """Helper to de-duplicate references in text. |
| |
    When some references are duplicated in an article, name the first
    occurrence and replace the duplicates with self-closing references
    that reuse that name.
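
    For example, two references with identical content::

        <ref>identical content</ref> ... <ref>identical content</ref>

    become something like (the exact name is built from the localized
    'reflinks-autogen' message)::

        <ref name="autogenerated1">identical content</ref> ...
        <ref name="autogenerated1" />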
| """ |
| |
| def __init__(self, site=None) -> None: |
| """Initializer.""" |
| if not site: |
| site = pywikibot.Site() |
| |
| # Match references |
| self.REFS = re.compile( |
| r'(?is)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>') |
| fmt = r'(?i){0}\s*=\s*(?P<quote>["\']?)\s*(?P<{0}>.+)\s*(?P=quote)' |
| self.NAMES = re.compile(fmt.format('name')) |
| self.GROUPS = re.compile(fmt.format('group')) |
| self.autogen = i18n.twtranslate(site, 'reflinks-autogen') |
| |
| def process(self, text): |
| """Process the page.""" |
| # keys are ref groups |
| # values are a dict where : |
| # keys are ref content |
| # values are [name, [list of full ref matches], |
| # quoted, need_to_change] |
| found_refs = {} |
| found_ref_names = set() |
| # Replace key by [value, quoted] |
| named_repl = {} |
| |
| # Parse references |
| for match in self.REFS.finditer(text): |
| content = match['content'] |
| if not content.strip(): |
| continue |
| |
| params = match['params'] |
| group = self.GROUPS.search(params) or '' |
| if group not in found_refs: |
| found_refs[group] = {} |
| |
| groupdict = found_refs[group] |
| if content in groupdict: |
| v = groupdict[content] |
| v[IX.reflist].append(match.group()) |
| else: |
| v = [None, [match.group()], False, False] |
| |
| found = self.NAMES.search(params) |
| if found: |
| quoted = found['quote'] in ['"', "'"] |
| name = found['name'] |
| |
| if not v[IX.name]: |
| # First name associated with this content |
| if name not in found_ref_names: |
| # first time ever we meet this name |
| v[IX.quoted] = quoted |
| v[IX.name] = name |
| else: |
| # if has_key, means that this name is used |
| # with another content. We'll need to change it |
| v[IX.change_needed] = True |
| elif v[IX.name] != name: |
| named_repl[name] = [v[IX.name], v[IX.quoted]] |
| |
| found_ref_names.add(name) |
| groupdict[content] = v |
| |
| # Find used autogenerated numbers |
| used_numbers = set() |
| for name in found_ref_names: |
| number = removeprefix(name, self.autogen) |
| with suppress(ValueError): |
| used_numbers.add(int(number)) |
| |
| # generator to give the next free number for autogenerating names |
| free_number = (str(i) for i in itertools.count(start=1) |
| if i not in used_numbers) |
| |
| # Fix references |
| for groupname, references in found_refs.items(): |
| group = f'group="{groupname}" ' if groupname else '' |
| |
| for ref, v in references.items(): |
| if len(v[IX.reflist]) == 1 and not v[IX.change_needed]: |
| continue |
| |
| name = v[IX.name] |
| if not name: |
| name = f'"{self.autogen}{next(free_number)}"' |
| elif v[IX.quoted]: |
| name = f'"{name}"' |
| |
| named = f'<ref {group}{name=!s}>{ref}</ref>' |
| text = text.replace(v[IX.reflist][0], named, 1) |
| |
| # make sure that the first (named ref) is not removed later |
| pos = text.index(named) + len(named) |
| header = text[:pos] |
| end = text[pos:] |
| |
| # replace multiple identical references with repeated ref |
| repeated_ref = f'<ref {group}{name=!s} />' |
| for ref in v[IX.reflist][1:]: |
| # Don't replace inside templates (T266411) |
| end = replaceExcept(end, re.escape(ref), repeated_ref, |
| exceptions=['template']) |
| text = header + end |
| |
| # Fix references with different names |
| for ref, v in named_repl.items(): |
| # TODO : Support ref groups |
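            # v here is the two-item [name, quoted] list built above, so
            # index IX.reflist (1) actually holds the quoted flag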
| name = v[IX.name] |
| if v[IX.reflist]: |
| name = f'"{name}"' |
| |
| text = re.sub(rf'<ref name\s*=\s*(?P<quote>["\']?)\s*{ref}\s*' |
| r'(?P=quote)\s*/>', f'<ref {name=!s} />', text) |
| return text |
| |
| |
| class ReferencesRobot(SingleSiteBot, ConfigParserBot, ExistingPageBot): |
| |
| """References bot. |
| |
| .. versionchanged:: 7.0 |
| ReferencesRobot is a ConfigParserBot |
| """ |
| |
| use_redirects = False |
| |
| update_options = { |
| 'ignorepdf': False, |
| 'limit': 0, # stop after n modified pages |
| 'summary': '', |
| } |
| |
| def __init__(self, **kwargs) -> None: |
| """Initializer.""" |
| super().__init__(**kwargs) |
| self._use_fake_user_agent = config.fake_user_agent_default.get( |
| 'reflinks', False) |
        # Check whether a localized manual page exists for this wiki
| manual = 'mw:Manual:Pywikibot/refLinks' |
| code = None |
| for alt in [self.site.code, *i18n._altlang(self.site.code)]: |
| if alt in localized_msg: |
| code = alt |
| break |
| if code: |
| manual += f'/{code}' |
| |
| if self.opt.summary: |
| self.msg = self.opt.summary |
| else: |
| self.msg = i18n.twtranslate(self.site, 'reflinks-msg', locals()) |
| |
| local = i18n.translate(self.site, badtitles) |
| bad = f'({globalbadtitles}|{local})' if local else globalbadtitles |
| |
| self.titleBlackList = re.compile(bad, re.I | re.S | re.X) |
| self.norefbot = noreferences.NoReferencesBot(verbose=False) |
| self.deduplicator = DuplicateReferences(self.site) |
| |
| self.site_stop_page = i18n.translate(self.site, stop_page) |
| if self.site_stop_page: |
| self.stop_page = pywikibot.Page(self.site, self.site_stop_page) |
| if self.stop_page.exists(): |
| self.stop_page_rev_id = self.stop_page.latest_revision_id |
| else: |
| pywikibot.warning( |
| f'The stop page {self.stop_page.title(as_link=True)} does' |
| ' not exist' |
| ) |
| |
        # Regex to grab the content-type meta tag from the HTML source
| self.META_CONTENT = re.compile( |
| br'(?i)<meta[^>]*(?:content\-type|charset)[^>]*>') |
| # Extract html title from page |
| self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)') |
| # Matches content inside <script>/<style>/HTML comments |
| self.NON_HTML = re.compile( |
| br'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|' |
| br'<!--.*?-->|<!\[CDATA\[.*?\]\]>') |
| |
| # Authorized mime types for HTML pages |
| self.MIME = re.compile( |
| r'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml') |
| |
| @staticmethod |
| def httpError(err_num, link, pagetitleaslink) -> None: |
| """Log HTTP Error.""" |
| pywikibot.stdout( |
| f'HTTP error ({err_num}) for {link} on {pagetitleaslink}') |
| |
| @staticmethod |
| def getPDFTitle(ref, response) -> None: |
| """Use pdfinfo to retrieve title from a PDF.""" |
| # pdfinfo is Unix-only |
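        # pdfinfo usually prints metadata lines such as
        # 'Title:          Some document title'; the loop below takes the
        # first line starting with 'title' and joins its remaining fields.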
| pywikibot.info('Reading PDF file...') |
| infile = None |
| try: |
| fd, infile = tempfile.mkstemp() |
| urlobj = os.fdopen(fd, 'w+') |
| urlobj.write(response.text) |
| pdfinfo_out = subprocess.Popen([r'pdfinfo', '/dev/stdin'], |
| stdin=urlobj, |
| stdout=subprocess.PIPE, |
| stderr=subprocess.PIPE, |
| shell=False).communicate()[0] |
| except ValueError: |
| pywikibot.info('pdfinfo value error.') |
| except OSError: |
| pywikibot.info('pdfinfo OS error.') |
| except Exception as e: # Ignore errors |
| pywikibot.info('PDF processing error.') |
| pywikibot.error(e) |
| else: |
| for aline in pdfinfo_out.splitlines(): |
| if isinstance(aline, bytes): |
| aline = aline.decode() |
| if aline.lower().startswith('title'): |
| ref.title = ' '.join(aline.split()[1:]) |
| if ref.title: |
| pywikibot.info('title: ' + ref.title) |
| break |
| pywikibot.info('PDF done.') |
| finally: |
| if infile is not None: |
| urlobj.close() |
| os.unlink(infile) |
| |
| def setup(self) -> None: |
| """Read dead links from file.""" |
| try: |
| path = Path(listof404pages) |
| self.dead_links = path.read_text(encoding='latin_1') |
| except OSError: |
| raise NotImplementedError( |
| '404-links.txt is required for reflinks.py\n' |
| 'You need to download\n' |
| 'http://www.twoevils.org/files/wikipedia/404-links.txt.gz\n' |
| 'and to unzip it in the same directory') |
| |
| def skip_page(self, page) -> bool: |
| """Skip unwanted pages.""" |
| if super().skip_page(page): |
| return True |
| |
| if not page.has_permission(): |
| pywikibot.warning(f"You can't edit page {page}") |
| return True |
| |
| return False |
| |
| def treat(self, page) -> None: |
| """Process one page.""" |
| # Load the page's text from the wiki |
| new_text = page.text |
| raw_text = textlib.removeDisabledParts(new_text) |
| # for each link to change |
| for match in linksInRef.finditer(raw_text): |
| link = match['url'] |
| if 'jstor.org' in link: |
| # TODO: Clean URL blacklist |
| continue |
| |
| ref = RefLink(link, match['name'], site=self.site) |
| |
| try: |
| r = comms.http.fetch( |
| ref.url, use_fake_user_agent=self._use_fake_user_agent) |
| |
| # Try to get Content-Type from server |
| content_type = r.headers.get('content-type') |
| if content_type and not self.MIME.search(content_type): |
| if ref.link.lower().endswith('.pdf') \ |
| and not self.opt.ignorepdf: |
| # If file has a PDF suffix |
| self.getPDFTitle(ref, r) |
| else: |
| pywikibot.info(f'<<lightyellow>>WARNING<<default>> : ' |
| f'media : {ref.link} ') |
| |
| if not ref.title: |
| repl = ref.refLink() |
| elif not re.match('(?i) *microsoft (word|excel|visio)', |
| ref.title): |
| ref.transform(ispdf=True) |
| repl = ref.refTitle() |
| else: |
| pywikibot.info(f'<<lightyellow>>WARNING<<default>> : ' |
| f'PDF title blacklisted : {ref.title} ') |
| repl = ref.refLink() |
| |
| new_text = new_text.replace(match.group(), repl) |
| continue |
| |
| # Get the real url where we end (http redirects !) |
| redir = r.url |
| if redir != ref.link \ |
| and domain.findall(redir) == domain.findall(link): |
| if soft404.search(redir) \ |
| and not soft404.search(ref.link): |
| pywikibot.info(f'<<lightyellow>>WARNING<<default>> : ' |
| f'Redirect 404 : {ref.link} ') |
| continue |
| |
| if dirIndex.fullmatch(redir) \ |
| and not dirIndex.fullmatch(ref.link): |
| pywikibot.info(f'<<lightyellow>>WARNING<<default>> : ' |
| f'Redirect to root : {ref.link} ') |
| continue |
| |
| if r.status_code != HTTPStatus.OK: |
| pywikibot.stdout( |
| f'HTTP error ({r.status_code}) for {ref.url} on ' |
| f'{page.title(as_link=True)}' |
| ) |
| # 410 Gone, indicates that the resource has been |
| # purposely removed |
| if r.status_code == HTTPStatus.GONE \ |
| or (r.status_code == HTTPStatus.NOT_FOUND |
| and f'\t{ref.url}\t' in self.dead_links): |
| repl = ref.refDead() |
| new_text = new_text.replace(match.group(), repl) |
| continue |
| |
| except UnicodeError: |
| # example: |
| # http://www.adminet.com/jo/20010615¦/ECOC0100037D.html |
| # in [[fr:Cyanure]] |
| pywikibot.info( |
| f'<<lightred>>Bad link<<default>> : {ref.url} in {page}') |
| continue |
| |
| except (ValueError, # urllib3.LocationParseError derives from it |
| OSError, |
| httplib.error, |
| ServerError) as err: |
| pywikibot.info(f"{err.__class__.__name__}: Can't retrieve url " |
| f'{ref.url}: {err}') |
| continue |
| |
| linkedpagetext = r.content |
| # remove <script>/<style>/comments/CDATA tags |
| linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext) |
| |
| meta_content = self.META_CONTENT.search(linkedpagetext) |
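            # META_CONTENT matches tags like <meta charset="utf-8"> or
            # <meta http-equiv="content-type" ...>; if found, it is used
            # below as a fallback source for the content type and charset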
| encoding = None |
| if content_type: |
| encoding = get_charset_from_content_type(content_type) |
| |
| if meta_content: |
| tag = None |
| # use a dict to keep the order |
| encodings = {encoding: None} if encoding else {} |
| encodings.update(dict.fromkeys(page.site.encodings())) |
| |
| for enc in encodings: |
| with suppress(UnicodeDecodeError, LookupError): |
| tag = meta_content.group().decode(enc) |
| break |
| |
| # Prefer the content-type from the HTTP header |
| if not content_type and tag: |
| content_type = tag |
| if not encoding: |
| encoding = get_charset_from_content_type(tag) |
| |
| if encoding: |
| r.encoding = encoding |
| |
| if not content_type: |
| pywikibot.info('No content-type found for ' + ref.link) |
| continue |
| |
| if not self.MIME.search(content_type): |
| pywikibot.info(f'<<lightyellow>>WARNING<<default>> : media : ' |
| f'{ref.link} ') |
| repl = ref.refLink() |
| new_text = new_text.replace(match.group(), repl) |
| continue |
| |
| # Retrieves the first non empty string inside <title> tags |
| for m in self.TITLE.finditer(r.text): |
| t = m.group() |
| if t: |
| ref.title = t |
| ref.transform() |
| if ref.title: |
| break |
| |
| if not ref.title: |
| repl = ref.refLink() |
| new_text = new_text.replace(match.group(), repl) |
| pywikibot.info(f'{ref.link} : No title found...') |
| continue |
| |
| if self.titleBlackList.match(ref.title): |
| repl = ref.refLink() |
| new_text = new_text.replace(match.group(), repl) |
| pywikibot.info(f'<<lightred>>WARNING<<default>> {ref.link} : ' |
| f'Blacklisted title ({ref.title})') |
| continue |
| |
            # Truncate long titles; the width of 178 is arbitrary (175 + '...')
| ref.title = shorten(ref.title, width=178, placeholder='...') |
| |
| repl = ref.refTitle() |
| new_text = new_text.replace(match.group(), repl) |
| |
        # Add <references/> when needed, but skip the Template namespace
        if page.namespace() != 10 \
                and self.norefbot.lacksReferences(new_text):
| new_text = self.norefbot.addReferences(new_text) |
| |
| new_text = self.deduplicator.process(new_text) |
| old_text = page.text |
| |
| if old_text == new_text: |
| return |
| |
| self.userPut(page, old_text, new_text, summary=self.msg, |
| ignore_save_related_errors=True, |
| ignore_server_errors=True) |
| |
| if not self.counter['write']: |
| return |
| |
| if self.opt.limit and self.counter['write'] >= self.opt.limit: |
| pywikibot.info(f'Edited {self.opt.limit} pages, stopping.') |
| self.generator.close() |
| |
| if self.site_stop_page and self.counter['write'] % 20 == 0: |
| self.stop_page = pywikibot.Page(self.site, self.site_stop_page) |
| if self.stop_page.exists(): |
| pywikibot.info('<<lightgreen>>Checking stop page...') |
| actual_rev = self.stop_page.latest_revision_id |
| if actual_rev != self.stop_page_rev_id: |
| pywikibot.info(f'{self.stop_page} has been edited: ' |
| f'Someone wants us to stop.') |
| self.generator.close() |
| |
| |
| def main(*args: str) -> None: |
| """Process command line arguments and invoke bot. |
| |
| If args is an empty list, sys.argv is used. |
| |
| :param args: command line arguments |
| """ |
| xml_filename = None |
| xml_start = None |
| options = {} |
| generator = None |
| |
| # Process global args and prepare generator args parser |
| local_args = pywikibot.handle_args(args) |
| gen_factory = pagegenerators.GeneratorFactory() |
| |
| for arg in local_args: |
| opt, _, value = arg.partition(':') |
| if opt in ('-summary', '-limit'): |
| options[opt[1:]] = value |
| elif opt in ('-always', '-ignorepdf'): |
| options[opt[1:]] = True |
| elif opt == '-xmlstart': |
| xml_start = value or pywikibot.input( |
| 'Please enter the dumped article to start with:') |
| elif opt == '-xml': |
| xml_filename = value or pywikibot.input( |
| "Please enter the XML dump's filename:") |
| else: |
| gen_factory.handle_arg(arg) |
| |
| if xml_filename: |
| generator = XmlDumpPageGenerator(xml_filename, xml_start, |
| gen_factory.namespaces) |
| if not generator: |
| generator = gen_factory.getCombinedGenerator() |
| if not generator: |
| pywikibot.bot.suggest_help(missing_generator=True) |
| return |
| if not gen_factory.nopreload: |
| generator = pagegenerators.PreloadingGenerator(generator) |
| generator = pagegenerators.RedirectFilterPageGenerator(generator) |
| bot = ReferencesRobot(generator=generator, **options) |
| bot.run() |
| |
| |
| if __name__ == '__main__': |
| main() |