Szerkesztő:Cherybot/archivebot hu.py
Megjelenés
Az aktuálisan használt változat: Szerkesztő:Atobot/archivebot hu.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
archivebot.py - discussion page archiving bot.
usage:
python archivebot.py [OPTIONS] TEMPLATE_PAGE
Bot examines backlinks (Special:Whatlinkshere) to TEMPLATE_PAGE.
Then goes through all pages (unless a specific page specified using options)
and archives old discussions. This is done by breaking a page into threads,
then scanning each thread for timestamps. Threads older than a specified
threshold are then moved to another page (the archive), which can be named
either based on the thread's name, or the name can contain a counter which
will be incremented when the archive reaches a certain size.
Options:
-h, --help show this help message and exit
-f FILE, --file=FILE load list of pages from FILE
-p PAGE, --page=PAGE archive a single PAGE
-n NAMESPACE, --namespace=NAMESPACE
only archive pages from a given namespace
-s SALT, --salt=SALT specify salt
-F, --force override security options
-c PAGE, --calc=PAGE calculate key for PAGE and exit
-l LOCALE, --locale=LOCALE
switch to locale LOCALE
"""
#
# (C) Misza13, 2006-2007
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id: archivebot.py 4946 2008-01-29 14:58:25Z wikipedian $'
#
import wikipedia, pagegenerators
Site = wikipedia.getSite()
import re, time, locale, traceback, string
try: #Get a constructor for the MD5 hash object
import hashlib
new_hash = hashlib.md5
except ImportError: #Old python?
import md5
new_hash = md5.md5
language = Site.language()
messages = {
'_default': {
'ArchiveFull': u'(ARCHIVE FULL)',
'InitialArchiveHeader': u'{{talkarchive}}',
'PageSummary': u'Archiving %(count)d thread(s) (%(why)s) to %(archives)s.',
'ArchiveSummary': u'Archiving %(count)d thread(s) from [[%(from)s]].',
'OlderThanSummary': u'older than',
},
'pl': {
'ArchiveFull': u'(ARCHIWUM PEŁNE)',
'InitialArchiveHeader': u'{{archiwum}}',
'PageSummary': u'Archiwizacja %(count)d wątków (%(why)s) do %(archives)s.',
'ArchiveSummary': u'Archiwizacja %(count)d wątków z [[%(from)s]].',
'OlderThanSummary': u'starsze niż',
},
'sv': {
'ArchiveFull': u'(ARKIV FULLT)',
'InitialArchiveHeader': u'{{arkiv}}',
'PageSummary': u'Arkiverar %(count)d trådar (%(why)s) till %(archives)s.',
'ArchiveSummary': u'Arkiverar %(count)d trådar från [[%(from)s]].',
'OlderThanSummary': u'äldre än',
},
'hu': {
'ArchiveFull': u'(MEGTELT)',
'InitialArchiveHeader': u'{{archív lap}}',
'PageSummary': u'%(count)d szakasz archiválása (%(why)s) a %(archives)s lapra',
'ArchiveSummary': u'%(count)d szakasz archiválása a [[%(from)s]] lapról',
'OlderThanSummary': u'régebbi, mint',
},
}
def message(key, lang=Site.language()):
    """Return the localized message text for *key*.

    Falls back to the '_default' (English) table when *lang* has no
    entry in ``messages``.  NOTE: the default for *lang* is evaluated
    once at import time, which is intentional - one bot run targets a
    single wiki.
    """
    # ``in`` instead of the deprecated dict.has_key() (removed in Python 3).
    if lang not in messages:
        lang = '_default'
    return messages[lang][key]
class MalformedConfigError(wikipedia.Error):
"""There is an error in the configuration template."""
class MissingConfigError(wikipedia.Error):
"""The config is missing in the header (either it's in one of the threads
or transcluded from another page)."""
class AlgorithmError(MalformedConfigError):
"""Invalid specification of archiving algorithm."""
class ArchiveSecurityError(wikipedia.Error):
"""Archive is not a subpage of page being archived and key not specified
(or incorrect)."""
def str2time(str):
    """Accepts a string defining a time period:
    7d - 7 days
    36h - 36 hours
    A plain number is taken as seconds.

    Returns the corresponding time, measured in seconds.
    Raises ValueError for an empty or non-numeric specification
    (the original raised a confusing IndexError on '').
    """
    # The parameter shadows the ``str`` builtin; the name is kept for
    # backward compatibility with keyword callers.  Work on an alias.
    spec = str
    if not spec:
        raise ValueError('empty time period specification')
    if spec[-1] == 'd':
        return int(spec[:-1]) * 24 * 3600
    if spec[-1] == 'h':
        return int(spec[:-1]) * 3600
    return int(spec)
def str2size(str):
    """Accepts a string defining a size:
    1337 - 1337 bytes
    150K - 150 kilobytes
    2M - 2 megabytes
    nT - n threads

    Returns a tuple (size,unit), where size is an integer and unit is
    'B' (bytes) or 'T' (threads).  Unrecognized (or empty) input yields
    (0,'B'); the original raised IndexError on an empty string.
    """
    # The parameter shadows the ``str`` builtin; kept for backward
    # compatibility with keyword callers.  Work on an alias.
    spec = str
    if not spec:
        return (0, 'B')
    suffix = spec[-1]
    if suffix in string.digits:  # no suffix: a plain byte count
        return (int(spec), 'B')
    if suffix == 'K':
        return (int(spec[:-1]) * 1024, 'B')
    if suffix == 'M':
        return (int(spec[:-1]) * 1024 * 1024, 'B')
    if suffix == 'T':
        return (int(spec[:-1]), 'T')
    return (0, 'B')
def int2month(num):
"""Returns the locale's full name of month 'num' (1-12)."""
return locale.nl_langinfo(locale.MON_1+num-1).decode('utf-8')
def int2month_short(num):
"""Returns the locale's abbreviated name of month 'num' (1-12)."""
return locale.nl_langinfo(locale.ABMON_1+num-1).decode('utf-8')
def txt2timestamp(txt, format):
    """Parse the timestamp *txt* according to *format* via time.strptime.

    Returns the resulting time tuple on success, or None when the text
    does not match the format.
    """
    try:
        parsed = time.strptime(txt, format)
    except ValueError:
        parsed = None
    return parsed
class DiscussionThread(object):
"""An object representing a discussion thread on a page, that is something of the form:
== Title of thread ==
Thread content here. ~~~~
:Reply, etc. ~~~~
"""
def __init__(self, title):
self.title = title
self.content = ""
self.timestamp = None
def __repr__(self):
return '%s("%s",%d bytes)' % (self.__class__.__name__,self.title,len(self.content))
def feedLine(self, line):
if not self.content and not line:
return
self.content += line + '\n'
#Postpone archiving if thread's marked with {{fuggoben}}
TM = re.search(ur'{{([Ss]ablon:)?[Ff]üggőben\s*($|\||}})', line)
if TM:
self.timestamp = time.time()
return
#Update timestamp
group0 = ''
TM = re.search(r'(\d\d\d\d)\. (\S+) (\d\d?)\., (\d\d):(\d\d) \(.*?\)', line)
if TM:
group0 = TM.group(0)
for m in [
[u'január', 'January'],
[u'február', 'February'],
[u'március', 'March'],
[u'április', 'April'],
[u'május', 'May'],
[u'június', 'June'],
[u'július', 'July'],
['augusztus', 'August'],
['szeptember', 'September'],
[u'október', 'October'],
['november', 'November'],
['december', 'December']
]:
rem = re.compile(m[0])
group0 = rem.sub(m[1], group0)
if not TM:
TM = re.search(r'(\d\d):(\d\d), (\d\d?) (\w+) (\d\d\d\d) \(.*?\)', line)
if not TM:
TM = re.search(r'(\d\d):(\d\d), (\w+) (\d\d?), (\d\d\d\d) \(.*?\)', line)
if TM:
if not group0:
group0 = TM.group(0)
TIME = txt2timestamp(group0,"%Y. %B %d., %H:%M (%Z)")
if not TIME:
TIME = txt2timestamp(group0,"%H:%M, %d %B %Y (%Z)")
if not TIME:
TIME = txt2timestamp(group0,"%H:%M, %d %b %Y (%Z)")
if not TIME:
TIME = txt2timestamp(group0,"%H:%M, %b %d %Y (%Z)")
if not TIME:
TIME = txt2timestamp(group0,"%H:%M, %b %d, %Y (%Z)")
if TIME:
self.timestamp = max(self.timestamp,time.mktime(TIME))
def size(self):
return len(self.title) + len(self.content) + 12
def toText(self):
return "== " + self.title + ' ==\n\n' + self.content
def shouldBeArchived(self,Archiver):
algo = Archiver.get('algo')
reT = re.search(r'^old\((.*)\)$',algo)
if reT:
if not self.timestamp:
return ''
#TODO: handle this:
#return 'unsigned'
maxage = str2time(reT.group(1))
if self.timestamp + maxage < time.time():
return message('OlderThanSummary') + ' ' + reT.group(1)
return ''
class DiscussionPage(object):
"""A class that represents a single discussion page as well as an archive page.
Feed threads to it and run an update() afterwards."""
#TODO: Make it a subclass of wikipedia.Page
def __init__(self, title, archiver, vars=None):
self.title = title
self.threads = []
self.Page = wikipedia.Page(Site,self.title)
self.full = False
self.archiver = archiver
self.vars = vars
try:
self.loadPage()
except wikipedia.NoPage:
self.header = archiver.get('archiveheader',message('InitialArchiveHeader'))
if self.vars:
self.header = self.header % self.vars
def loadPage(self):
"""Loads the page to be archived and breaks it up into threads."""
self.header = ''
self.threads = []
self.archives = {}
self.archivedThreads = 0
lines = self.Page.get().split('\n')
state = 0 #Reading header
curThread = None
for line in lines:
threadHeader = re.search('^== *([^=].*?) *==$',line)
if threadHeader:
state = 1 #Reading threads now
if curThread:
self.threads.append(curThread)
curThread = DiscussionThread(threadHeader.group(1))
else:
if state == 1:
curThread.feedLine(line)
else:
self.header += line + '\n'
if curThread:
self.threads.append(curThread)
def feedThread(self, thread, maxArchiveSize=(250*1024,'B')):
self.threads.append(thread)
self.archivedThreads += 1
if maxArchiveSize[1] == 'B':
if self.size() >= maxArchiveSize[0]:
self.full = True
elif maxArchiveSize[1] == 'T':
if len(self.threads) >= maxArchiveSize[0]:
self.full = True
return self.full
def size(self):
return len(self.header) + sum([t.size() for t in self.threads])
def update(self, summary, sortThreads = False):
if sortThreads:
wikipedia.output(u'Sorting threads...')
self.threads.sort(key = lambda t: t.timestamp)
newtext = re.sub('\n*$','\n\n',self.header) #Fix trailing newlines
for t in self.threads:
newtext += t.toText()
if self.full:
summary += ' ' + message('ArchiveFull')
self.Page.put(newtext, minorEdit=True, comment=summary)
class PageArchiver(object):
"""A class that encapsulates all archiving methods.
__init__ expects a wikipedia.Page object.
Execute by running the .run() method."""
algo = 'none'
pageSummary = message('PageSummary')
archiveSummary = message('ArchiveSummary')
def __init__(self, Page, tpl, salt, force=False):
self.attributes = {
'algo' : ['old(24h)',False],
'archive' : ['',False],
'maxarchivesize' : ['1000M',False],
'counter' : ['1',False],
'key' : ['',False],
}
self.tpl = tpl
self.salt = salt
self.force = force
self.Page = DiscussionPage(Page.title(),self)
self.loadConfig()
self.commentParams = {
'from' : self.Page.title,
}
self.archives = {}
self.archivedThreads = 0
def get(self, attr, default=''):
return self.attributes.get(attr,[default])[0]
def set(self, attr, value, out=True):
self.attributes[attr] = [value, out]
def saveables(self):
return [a for a in self.attributes if self.attributes[a][1] and a != 'maxage']
def attr2text(self):
return '{{%s\n%s\n}}' % (self.tpl,'\n'.join(['|%s = %s'%(a,self.get(a)) for a in self.saveables()]))
def key_ok(self):
return 1 # XXX key_ok() screws up with unicode somehow
s = new_hash()
s.update(self.salt+'\n')
s.update(self.Page.title+'\n')
return self.get('key') == s.hexdigest()
def loadConfig(self):
hdrlines = self.Page.header.split('\n')
mode = 0
for line in hdrlines:
if mode == 0 and re.search('{{'+self.tpl,line):
mode = 1
continue
if mode == 1 and re.match('}}',line):
break
attRE = re.search(r'^\| *(\w+) *= *(.*?) *$',line)
if mode == 1 and attRE:
self.set(attRE.group(1),attRE.group(2))
continue
if mode == 0 or not self.get('algo',''):
raise MissingConfigError
def feedArchive(self, archive, thread, maxArchiveSize, vars=None):
"""Feed the thread to one of the archives.
If it doesn't exist yet, create it.
If archive name is an empty string (or None), discard the thread (/dev/null).
Also checks for security violations."""
if not archive:
return
if not self.force and not self.Page.title+'/' == archive[:len(self.Page.title)+1] and not self.key_ok():
raise ArchiveSecurityError
if not self.archives.has_key(archive):
self.archives[archive] = DiscussionPage(archive,self,vars)
return self.archives[archive].feedThread(thread,maxArchiveSize)
def analyzePage(self):
maxArchSize = str2size(self.get('maxarchivesize'))
archCounter = int(self.get('counter','1'))
oldthreads = self.Page.threads
self.Page.threads = []
T = time.mktime(time.gmtime())
whys = []
for t in oldthreads:
if len(oldthreads) - self.archivedThreads <= int(self.get('minthreadsleft',5)):
self.Page.threads.append(t)
continue #Because there's too little threads left.
#TODO: Make an option so that unstamped (unsigned) posts get archived.
why = t.shouldBeArchived(self)
if why:
archive = self.get('archive')
TStuple = time.gmtime(t.timestamp)
vars = {
'counter' : archCounter,
'year' : TStuple[0],
'month' : TStuple[1],
'monthname' : int2month(TStuple[1]),
'monthnameshort' : int2month_short(TStuple[1]),
}
archive = archive % vars
if self.feedArchive(archive,t,maxArchSize,vars):
archCounter += 1
self.set('counter',str(archCounter))
whys.append(why)
self.archivedThreads += 1
else:
self.Page.threads.append(t)
return set(whys)
def run(self):
whys = self.analyzePage()
if self.archivedThreads < int(self.get('minthreadstoarchive',2)):
return #We might not want to archive a measly few threads (lowers edit frequency)
if whys:
#Save the archives first (so that bugs don't cause a loss of data)
for a in sorted(self.archives.keys()):
self.commentParams['count'] = self.archives[a].archivedThreads
comment = self.archiveSummary % self.commentParams
self.archives[a].update(comment)
#Save the page itself
rx = re.compile('{{'+self.tpl+'\n.*?\n}}',re.DOTALL)
self.Page.header = rx.sub(self.attr2text(),self.Page.header)
self.commentParams['count'] = self.archivedThreads
self.commentParams['archives'] = ', '.join(['[['+a.title+']]' for a in self.archives.values()])
if not self.commentParams['archives']:
self.commentParams['archives'] = '/dev/null'
self.commentParams['why'] = ', '.join(whys)
comment = self.pageSummary % self.commentParams
self.Page.update(comment)
def main():
    """Parse command-line options and archive every page that links to
    the given template page(s), or the pages named via -f/-p."""
    import os  #fix: os was used below (-T handling) but never imported
    from optparse import OptionParser
    parser = OptionParser(usage='usage: %prog [options] [LINKPAGE(s)]')
    parser.add_option('-f', '--file', dest='filename',
                      help='load list of pages from FILE', metavar='FILE')
    parser.add_option('-p', '--page', dest='pagename',
                      help='archive a single PAGE', metavar='PAGE')
    parser.add_option('-n', '--namespace', dest='namespace', type='int',
                      help='only archive pages from a given namespace')
    parser.add_option('-s', '--salt', dest='salt',
                      help='specify salt')
    parser.add_option('-F', '--force', action='store_true', dest='force',
                      help='override security options')
    parser.add_option('-c', '--calc', dest='calc',
                      help='calculate key for PAGE and exit', metavar='PAGE')
    parser.add_option('-l', '--locale', dest='locale',
                      help='switch to locale LOCALE', metavar='LOCALE')
    parser.add_option('-T', '--timezone', dest='timezone',
                      help='switch timezone to TIMEZONE', metavar='TIMEZONE')
    (options, args) = parser.parse_args()
    if options.locale:
        locale.setlocale(locale.LC_TIME,options.locale) #Required for english month names
    if options.timezone:
        os.environ['TZ'] = options.timezone
        #Or use the preset value
        time.tzset()
    if options.calc:
        #Key-calculation mode: print md5(salt + page title) and quit.
        if not options.salt:
            parser.error('you must specify a salt to calculate a key')
        s = new_hash()
        s.update(options.salt+'\n')
        s.update(options.calc+'\n')
        wikipedia.output(u'key = ' + s.hexdigest())
        return
    salt = options.salt if options.salt else ''
    force = bool(options.force)
    for a in args:
        pagelist = []
        if not options.filename and not options.pagename:
            #Default: archive every page that transcludes the template.
            for pg in wikipedia.Page(Site,a).getReferences(follow_redirects=False,onlyTemplateInclusion=True):
                pagelist.append(pg)
        if options.filename:
            #fix: open() instead of the py3-removed file() builtin, close
            #the handle, and strip the trailing newline off each title.
            listfile = open(options.filename,'r')
            try:
                for pg in listfile.readlines():
                    pagelist.append(wikipedia.Page(Site,pg.strip()))
            finally:
                listfile.close()
        if options.pagename:
            pagelist.append(wikipedia.Page(Site,options.pagename))
        pagelist = sorted(pagelist)
        if options.namespace is not None:  #fix: identity test, not == None
            pagelist = [pg for pg in pagelist if pg.namespace()==options.namespace]
        for pg in pagelist:
            #Catching exceptions, so that errors in one page do not bail out the entire process
            try:
                wikipedia.output(u'Processing page [[%s]]' % pg.title())
                Archiver = PageArchiver(pg, a, salt, force)
                Archiver.run()
                time.sleep(10)
            except Exception:
                #fix: was a bare ``except:`` which also swallowed
                #KeyboardInterrupt/SystemExit, making the bot unstoppable.
                wikipedia.output(u'Error occured while processing page [[%s]]' % pg.title())
                traceback.print_exc()
#Script entry point.  stopme() releases the pywikipedia throttle slot and
#must run even when main() raises - hence the try/finally (the original
#skipped it on any exception).
if __name__ == '__main__':
    try:
        main()
    finally:
        wikipedia.stopme()