User:FluffyBunnyBot/Source code

From the RuneScape Wiki, the wiki for all things RuneScape
Jump to: navigation, search
<?php

/**
 * htmlfix.php
 *
 * This bot automatically finds deprecated/nonexistent/invalid HTML and replaces it.
 *
 * This code is based off of hiscores.php written by Quarenon and improvements to this code were made by Supertech1.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

define('API_URL', 'http://runescape.wikia.com/api.php'); // The URL of the api.php script to use.

define('DB_FILE', 'htmlfix.db'); // Serialized data file; read at startup, but its contents are not otherwise used by this script
define('LOCK_FILE', 'htmlfix.lock'); // Mutex lock for script
define('COOKIE_FILE', 'htmlfix.cookie'); // Storage of API cookie data

define('LOG_FORMAT', '[m/d/Y h:i:s A T] %\s'); // Log format: a date() format string; the escaped "%\s" is where the log message is inserted

define('BOT_USERNAME', 'FluffyBunnyBot'); // Wiki username
define('BOT_PASSWORD', 'none of your business ^^'); // Wiki password

define('CATEGORY', 'Category:Archives'); // Not referenced anywhere in this script - presumably a leftover from hiscores.php; TODO confirm and remove
require_once('wikibot.php');

/**
 * Extract the namespace and base page from an article title.
 */
/**
 * Split an article title into its namespace prefix and base page name.
 *
 * Only the first colon separates the namespace from the page name, so any
 * further colons stay part of the page name.  Subpages (everything from the
 * first "/" onwards) are dropped.
 *
 * A title without a colon is treated as having an empty namespace.  Note that
 * a main-namespace title which itself contains a colon cannot be told apart
 * from a namespaced title here, because no namespace list is consulted.
 *
 * @param string The full article title, e.g. "Forum:Some thread/Archive 1".
 * @return array A two-element array: array(namespace, base page name).
 */
function splitTitle($title) {
    $parts = explode(':', $title, 2);

    if (count($parts) === 2) {
        $namespace = $parts[0];
        $pageName = $parts[1];
    } else {
        // No namespace prefix present.
        $namespace = '';
        $pageName = $parts[0];
    }

    $segments = explode('/', $pageName);

    return array($namespace, $segments[0]);
}

/**
 * Check for integer value (not type-strict)
 */
/**
 * Tell whether a value is written exactly like an integer.
 *
 * The value is cast to a string, and separately to an int and back to a
 * string; it counts as an integer only if both strings are identical.
 * The check is therefore about the textual form, not the PHP type.
 *
 * @param mixed The value to test.
 * @return boolean True if the value's string form equals its integer form.
 */
function isInt($val) {
    $asWritten = (string) $val;
    $asInteger = (string) (int) $val;

    return $asWritten === $asInteger;
}

// Create the bot and configure it.
$bot = new WikiBot(API_URL, COOKIE_FILE);

// Set the cookie file explicitly rather than relying on the constructor's
// extra argument, so the login session is always stored in COOKIE_FILE.
$bot->setCookie(COOKIE_FILE);
$bot->setLogFormat(LOG_FORMAT);

// Prevent multiple invocations of this script.  A failed fopen() is treated
// the same as a failed lock, instead of passing false on to flock().
$lock = fopen(LOCK_FILE, 'w+');
if ($lock === false || !flock($lock, (LOCK_EX | LOCK_NB))) {
    $bot->log('Unable to acquire lock - another instance may be running.');
    die();
}

// Load the persistent data file if there is one.  $db stays false when the
// file is missing or unreadable, which matches the result of the previous
// error-suppressed read.
$bot->log('Loading persistent database.');
$db = is_readable(DB_FILE) ? unserialize(file_get_contents(DB_FILE)) : false;

// Log in.  Catching the failure here keeps an uncaught-exception stack trace
// (which may include the login arguments) out of the output.
try {
    $bot->login(BOT_USERNAME, BOT_PASSWORD);
} catch (Exception $e) {
    $bot->log('Login failed: ' . $e->getMessage());
    exit(1);
}
sleep(15);

// Continuation marker for the page listing; empty means "start from the beginning".
$next = '';

// Namespace to tidy, given by its numeric id.  The ids below were taken from
// http://runescape.wikia.com/api.php?action=query&meta=siteinfo&siprop=namespaces&format=xmlfm
//
//    -2 Media          -1 Special          0 (main)          1 Talk
//     2 User            3 User talk        4 RuneScape       5 RuneScape talk
//     6 File            7 File talk        8 MediaWiki       9 MediaWiki talk
//    10 Template       11 Template talk   12 Help           13 Help talk
//    14 Category       15 Category talk  100 Update        101 Update talk
//   110 Forum         111 Forum talk     112 Exchange      113 Exchange talk
$namespace = '110';

// Ordered list of array(pattern, replacement) tidy rules.  The order matters:
// e.g. closing </span ...> tags are normalised before the background rule runs.
//
// Simple tags are matched as "<tag\s*>" so that a rule for a short tag name
// cannot match a longer tag (<u> vs <ul>, <b> vs <br>).
//
// NOTE(review): the <font>/<span>/<div> attribute rules capture everything up
// to the closing ">" as the attribute value, so tags carrying more than one
// attribute are converted incorrectly - confirm whether such tags occur.
$rules = array(
    array('#<font\s+color\s*=\s*["\']?(.*?)["\']?>([\s\S]*?)</font.*?>#i', '<span style="color: $1;">$2</span>'),
    array('#<font\s+face\s*=\s*["\']?(.*?)["\']?>([\s\S]*?)</font.*?>#i', '<span style="font-family: $1;">$2</span>'),
    array('#<font\s+style\s*=\s*["\']?(.*?)["\']?>([\s\S]*?)</font.*?>#i', '<span style="$1">$2</span>'),
    // Font sizes are dropped; only the enclosed text is kept.
    array('#<font\s+size\s*=\s*["\']?(.*?)["\']?>([\s\S]*?)</font.*?>#i', '$2'),
    array('#</span.*?>#i', '</span>'),
    array('#<span\s+style\s*=\s*["\']?background:(.*?)["\']?>([\s\S]*?)</span.*?>#i', '<span style="background-color:$1">$2</span>'),
    // <center> centres its content, so the replacement uses text-align.
    array('#<center\s*>([\s\S]*?)</center\s*>#i', '<div style="text-align:center;">$1</div>'),
    array('#<div\s+align\s*=\s*["\']?(.*?)["\']?>([\s\S]*?)</div.*?>#i', '<div style="text-align:$1;">$2</div>'),
    array('#<big\s*>([\s\S]*?)</big\s*>#i', '$1'),
    array('#<u\s*>([\s\S]*?)</u\s*>#i', '<span style="text-decoration:underline;">$1</span>'),
    array('#<i\s*>([\s\S]*?)</i\s*>#i', "''\$1''"),
    array('#<b\s*>([\s\S]*?)</b\s*>#i', "'''\$1'''"),
    array('#<strike\s*>([\s\S]*?)</strike\s*>#i', '<span style="text-decoration:line-through;">$1</span>'),
    array('#<s\s*>([\s\S]*?)</s\s*>#i', '<span style="text-decoration:line-through;">$1</span>'),
);

do {
    // Fetch the next batch of page titles in the namespace.
    $titles = $bot->getAllPages(500, $next, $namespace);

    // getAllPages() stores the continuation marker under the 'next' key;
    // pull it out so that only titles remain in the array.
    if (array_key_exists('next', $titles)) {
        $next = $titles['next'];
        unset($titles['next']);
    } else {
        $next = '';
    }

    // Iterate over pages within the namespace
    foreach ($titles as $title) {
        try {
            $page = $bot->getArticle($title);
        } catch (Exception $e) {
            $bot->log("Skipping article '$title', as its wikitext could not be fetched: " . $e->getMessage());
            continue;
        }
        sleep(5);

        // No wikitext came back (e.g. the page no longer exists).  Without this
        // guard an empty result could end up being saved over the page.
        if (!is_string($page)) {
            $bot->log("Skipping article '$title', as no wikitext was returned for it.");
            continue;
        }

        // Apply the tidy rules in order.  preg_replace() returns NULL on a PCRE
        // error (e.g. backtrack limit on a very large page); carrying on with
        // NULL would blank the article, so the article is skipped instead.
        $text = $page;
        $failed = false;
        foreach ($rules as $rule) {
            $result = preg_replace($rule[0], $rule[1], $text);
            if ($result === null) {
                $failed = true;
                break;
            }
            $text = $result;
        }

        if ($failed) {
            $bot->log("Skipping article '$title', as a tidy rule failed with a regex error.");
            continue;
        }

        $text = str_replace("<br>", "<br />", $text);
        // NOTE(review): as written, search and replacement look identical, which
        // makes this a no-op - presumably an entity or non-breaking space was lost
        // when the source was copied; confirm the intended search string.
        $text = str_replace(" ", " ", $text);

        if ($text === $page) {
            $bot->log('Skipping article ' . $title . ', as it has no detectable XHTML 2 incompatibilities.');
            continue;
        }

        $bot->log("Tidying up article '$title'...");
        try {
            $bot->editArticle($title, $text, 'Automated HTML tidying. If you find any errors, please report them [[User talk:Soldier 1033|here]].', true, true);
        } catch (Exception $e) {
            // Keep going with the remaining pages, but leave a trace of the failure.
            $bot->log("Failed to edit article '$title': " . $e->getMessage());
        }
    }
} while (!empty($next));

?>

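Framework (the WikiBot class used by the script above; presumably saved as wikibot.php, the file the script loads with require_once):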

<?php

/**
 * The WikiBot class provides a framework for the MW Query API.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * @author Quarenon
 * @package WikiBot
 */

/**
 * The WikiBot class.
 * @package WikiBot
 */
 
 // Default timezone for PHP's date functions; WikiBot::log() formats its timestamps with date().
 date_default_timezone_set('America/Chicago');
 
class WikiBot {
    // URL of the target wiki's api.php script.
    private $url;

    // Cookie file handed to cURL as both cookie jar and cookie source.
    private $cookie = 'wikibot.cookie';

    // date() format string for log lines; "%\s" marks where the message goes.
    private $logFormat = '[m/d/Y H:i:s] %\s';
    // Open writable handle for log output, or false to disable logging.
    private $logHandle = STDOUT;

    // Seconds to sleep after a successful GET / POST call.
    private $getDelay = 5;
    private $postDelay = 5;
    // Value sent as the API "maxlag" parameter unless a call supplies its own.
    private $maxLag = 5;

    // Exponential backoff settings for deferred calls (see setBackoff()).
    private $backoffStart = 15;
    private $backoffLimit = 300;
    private $backoffFactor = 2;
    // Maximum number of attempts; 0 means retry without limit.
    private $backoffAttempts = 0;

    // cURL timeout per request, in seconds.
    private $timeout = 15;

    // When true, editArticle() logs and returns without editing.
    private $readOnly = false;

    /**
     * Generate a WikiBot.
     *
     * @param string The URL of the target wiki's api.php script. By default, the URL for the English Wikipedia is used.
     * @param string|null Optional name of the cookie file used for login sessions.  If omitted, the default cookie file is kept.
     */
    public function __construct($url = 'http://en.wikipedia.org/w/api.php', $cookie = null) {
        $this->url = $url;

        if ($cookie !== null) {
            $this->cookie = $cookie;
        }
    }

    /**
     * Set the bot's API URL.
     *
     * @param string The URL of the target wiki's api.php script.
     * @return void
     */
    public function setURL($url) {
        $this->url = $url;
    }

    /**
     * Set request delays after each successful POST and GET request.
     *
     * @param int The delay after a GET request to the API.
     * @param int The delay after a POST request to the API.
     * @return void
     */
    public function setDelay($getDelay, $postDelay) {
        $this->getDelay = $getDelay;
        $this->postDelay = $postDelay;
    }

    /**
     * Set the max database lag for each request.  Higher values are more aggressive.
     *
     * @link http://www.mediawiki.org/wiki/Manual:Maxlag_parameter
     * @param int The maximum database lag, in seconds.
     * @return void
     */
    public function setMaxLag($maxLag) {
        $this->maxLag = $maxLag;
    }

    /**
     * Set the local cookie file used for login sessions.
     *
     * @param string The name of the cookie file.
     * @return void
     */
    public function setCookie($cookie) {
        $this->cookie = $cookie;
    }

    /**
     * Set the logging format for each log line.
     *
     * @param string Set a date() compatible format string, and use "%\s" for the log string itself.
     * @return void
     */
    public function setLogFormat($format) {
        $this->logFormat = $format;
    }

    /**
     * Set the file handle used for logging.
     *
     * @param resource|boolean An open handle to a writable file, or false to disable built-in logging.
     * @return void
     */
    public function setLogHandle($handle) {
        $this->logHandle = $handle;
    }

    /**
     * Write a log message.
     *
     * The log format is first expanded by date() (which turns the escaped
     * "%\s" into "%s") and the message is then inserted with sprintf().
     *
     * @param string A log string.
     * @return int|boolean The value returned by the call to fwrite() on the current log file handle, or false if logging was disabled.
     */
    public function log($str) {
        if ($this->logHandle === false) {
            return false;
        }

        return fwrite($this->logHandle, sprintf(date($this->logFormat), $str) . "\n");
    }

    /**
     * Set exponential backoff parameters for transient call errors.
     *
     * @param int The maximum number of attempts allowed before an Exception is thrown; use 0 for infinite attempts or 1 to disable all retries.
     * @param int The initial delay in seconds after the first retry.
     * @param int The maximum delay in seconds for any retry.
     * @param int The factor used to increase the delay for each successive retry.
     * @return void
     */
    public function setBackoff($maxAttempts, $startDelay = 15, $maxDelay = 300, $delayFactor = 2) {
        $this->backoffStart = $startDelay;
        $this->backoffLimit = $maxDelay;
        $this->backoffFactor = $delayFactor;
        $this->backoffAttempts = $maxAttempts;
    }

    /**
     * Set the maximum timeout for API calls.
     *
     * @param int The maximum timeout, in seconds.
     * @return void
     */
    public function setTimeout($timeout) {
        $this->timeout = $timeout;
    }

    /**
     * Set the read-only mode.  If read-only is enabled, any calls that might modify the wiki (such as editing) are disabled and immediately return.
     *
     * @param boolean True to enable read-only mode.
     * @return void
     */
    public function setReadOnly($readOnly) {
        $this->readOnly = $readOnly;
    }

    /**
     * Retrieve a template's parameters contained within the given wikitext.  If multiple templates
     * are found, each one is returned using an increasing numerical index.  The returned array
     * is accessed with $arr[templateIndex][parameterName].  Parameter names are assigned
     * in a similar way that MW assigns them:  named parameters ("name=value") are indexed using
     * their name and unnamed parameters use an increasing numerical index starting at 1.
     *
     * Only the first "=" separates a name from its value, so values may contain "=".
     * A named parameter with an empty value is stored under its name with an empty string.
     * Parameters with an empty name, and empty unnamed parameters, are skipped.
     *
     * This method is EXPERIMENTAL and may not handle all cases correctly.  In particular, there
     * may be problems handling templates in non-template namespaces, and parameters that
     * themselves contain "|" or "}}" (links, nested templates) are not understood.
     *
     * @param string The wikitext that contains the template.
     * @param string The name of the template.
     * @return array A two-dimensional array containing the template parameters.
     */
    public function parseTemplate($text, $template) {
        // Escape regex metacharacters so names such as "Foo (bar)" match literally,
        // then make spaces and underscores interchangeable.
        $template = preg_replace('/[ _]/', '[ _]', preg_quote($template, '#'));

        // The template: match doesn't work this way.  Instead, there should be a test for a colon in the wikitext.
        // If it doesn't exist, then the template belongs in Template namespace, and only then is template: optional.
        $found = preg_match_all('#{{\s*(template:)?' . $template . '\s*\|([\w\W]+?)}}#i', $text, $matches);

        $data = array();

        // Nothing matched, or the regex failed.
        if (!$found) {
            return $data;
        }

        foreach ($matches[2] as $i => $paramString) {
            $position = 1;

            foreach (explode('|', $paramString) as $param) {
                $parts = explode('=', $param, 2);

                if (count($parts) === 2) {
                    // Named parameter.
                    $name = trim($parts[0]);
                    $value = trim($parts[1]);

                    if ($name === '') {
                        continue;
                    }
                } else {
                    // Unnamed parameter
                    $value = trim($parts[0]);

                    if ($value === '') {
                        continue;
                    }

                    $name = $position++;
                }

                $data[$i][$name] = $value;
            }
        }

        return $data;
    }

    /**
     * Log into the wiki.  Session data is kept via cookies.
     *
     * @param string The wiki username.
     * @param string The wiki password.
     * @return void
     * @throws Exception on login failure or API error.
     */
    public function login($username, $password) {
        $this->log('Logging into ' . $this->url . ' as ' . $username);

        $response = $this->callAPI(array(
            'action' => 'login',
            'lgname' => $username,
            'lgpassword' => $password,
        ), true);

        // MW >= 1.15.3 requires this
        if (isset($response['login']['result']) && $response['login']['result'] == 'NeedToken') {
            $response = $this->callAPI(array(
                'action' => 'login',
                'lgname' => $username,
                'lgpassword' => $password,
                'lgtoken' => $response['login']['token'],
            ), true);
        }

        $result = isset($response['login']['result']) ? $response['login']['result'] : 'NoResult';

        if ($result != 'Success') {
            throw new Exception($result);
        }
    }

    /**
     * Get the wikitext of an article.
     *
     * @param string The article title.
     * @param boolean Follow any redirects.
     * @return string|null The wikitext of the first page in the response, or null if the response contains no revision content (e.g. the page does not exist).
     * @throws Exception on API error.
     */
    public function getArticle($article, $followRedirects = true) {
        $this->log('Getting wikitext for article ' . $article);

        $call = array(
            'action' => 'query',
            'prop' => 'revisions',
            'titles' => $article,
            'rvprop' => 'content',
        );

        if ($followRedirects) {
            $call['redirects'] = '';
        }

        $response = $this->callAPI($call);

        if (!isset($response['query']['pages']) || !is_array($response['query']['pages'])) {
            return null;
        }

        // Only one title was requested, so the first page entry is the one wanted.
        $page = reset($response['query']['pages']);

        if (!isset($page['revisions'][0]['*'])) {
            return null;
        }

        return $page['revisions'][0]['*'];
    }

    /**
     * Get a list of categories that an article belongs to.
     *
     * @param string The article title.
     * @param boolean Follow any redirects.
     * @return array An array of strings containing category names.
     * @throws Exception on API error.
     */
    public function getCategories($article, $followRedirects = true) {
        $this->log('Getting categories for article ' . $article);

        $call = array(
            'action' => 'query',
            'prop' => 'categories',
            'titles' => $article,
        );

        if ($followRedirects) {
            $call['redirects'] = '';
        }

        $response = $this->callAPI($call);

        $categories = array();

        if (!isset($response['query']['pages'])) {
            return $categories;
        }

        foreach ($response['query']['pages'] as $page) {
            // Pages without categories carry no 'categories' entry.
            if (!isset($page['categories'])) {
                continue;
            }

            foreach ($page['categories'] as $category) {
                $categories[] = $category['title'];
            }
        }

        return $categories;
    }

    /**
     * Get all article titles within a specific namespace.
     *
     * @param int|string Limit of articles to return.  To use the maximum allowed limit for the current user, use 'max'.
     * @param string Start of the first article to return.
     * @param int The namespace to use.
     * @param string Filter to use on redirects.  Possible values are 'all', 'redirects', and 'nonredirects'.
     * @return array An array of strings containing article titles.  If the limit was reached, 'next' will be set to the first unreturned title.
     * @throws Exception on API error.
     */
    public function getAllPages($limit = 500, $start = '', $ns = 0, $redirects = 'nonredirects') {
        $this->log('Getting all pages in namespace ' . $ns . ' from ' . $start);

        $response = $this->callAPI(array(
            'action' => 'query',
            'list' => 'allpages',
            'apfrom' => $start,
            'apnamespace' => $ns,
            'apfilterredir' => $redirects,
            'aplimit' => $limit
        ));

        return $this->collectTitles($response, 'allpages', 'apfrom');
    }

    /**
     * Get a list of articles that a page is embedded in (e.g. templates)
     *
     * @param string The page.
     * @param int|string Limit of articles to return.  To use the maximum allowed limit for the current user, use 'max'.
     * @param string Start of the first article to return.
     * @return array An array of strings containing article titles.  If the limit was reached, 'next' will be set to the first unreturned title.
     * @throws Exception on API error.
     */
    public function listEmbedded($article, $limit = 500, $start = false) {
        $this->log('Getting list of articles embedding ' . $article . ' from ' . $start);

        $call = array(
            'action' => 'query',
            'list' => 'embeddedin',
            'eititle' => $article,
            'eilimit' => $limit,
        );

        if ($start !== false) {
            $call['eicontinue'] = $start;
        }

        $response = $this->callAPI($call);

        return $this->collectTitles($response, 'embeddedin', 'eicontinue');
    }

    /**
     * Get a list of articles belonging to a category.
     *
     * @param string The category.
     * @param int|string Limit of articles to return.  To use the maximum allowed limit for the current user, use 'max'.
     * @param string Start of the first article to return.
     * @param int|string The namespace of the articles, or 'all' for any namespace.
     * @return array An array of strings containing article titles.  If the limit was reached, 'next' will be set to the first unreturned title.
     * @throws Exception on API error.
     */
    public function listCategory($category, $limit = 500, $start = '', $ns = 'all') {
        $call = array(
            'action' => 'query',
            'list' => 'categorymembers',
            'cmtitle' => $category,
            'cmlimit' => $limit,
        );

        if ($ns != 'all') {
            $call['cmnamespace'] = $ns;
        }

        // Only send a continuation token when there is one; an empty token
        // carries no information and may be rejected by the API.
        if ($start !== '' && $start !== false && $start !== null) {
            $call['cmcontinue'] = $start;
        }

        $this->log('Getting list of articles in ' . $category . ' from ' . $start);

        $response = $this->callAPI($call);

        return $this->collectTitles($response, 'categorymembers', 'cmcontinue');
    }

    /**
     * Collect the titles of a list query and its continuation marker, if any.
     *
     * @param array The decoded API response.
     * @param string The list name used under 'query' and 'query-continue' (e.g. 'allpages').
     * @param string The key of the continuation value (e.g. 'apfrom').
     * @return array The titles, plus a 'next' entry when the response contains a non-empty continuation value.
     */
    private function collectTitles($response, $list, $continueKey) {
        $pages = array();

        if (isset($response['query'][$list]) && is_array($response['query'][$list])) {
            foreach ($response['query'][$list] as $entry) {
                $pages[] = $entry['title'];
            }
        }

        // 'query-continue' is absent on the last batch.
        if (isset($response['query-continue'][$list][$continueKey])) {
            $next = $response['query-continue'][$list][$continueKey];

            if ((string) $next !== '') {
                $pages['next'] = $next;
            }
        }

        return $pages;
    }

    /**
     * Edit an article.  If it doesn't exist, a new article is created.  If read-only is enabled, this call makes no edit and returns immediately.
     *
     * @param string The article title.
     * @param string The text of the article.
     * @param string The edit summary.
     * @param boolean If true, mark edit as minor.
     * @param boolean If true, mark edit with bot flag.
     * @param mixed The section to edit.  Sections are indexed numerically, with 0 as the top section.  Use 'new' to edit a new section, or set false to edit the entire article.
     * @return void
     * @throws Exception on API error, if no edit token is returned, or if the API reports an edit result other than 'Success'.
     */
    public function editArticle($article, $text, $summary, $minor = false, $bot = true, $section = false) {
        if ($this->readOnly) {
            $this->log('Read-only is enabled; ' . $article . ' was not edited.');
            return;
        }

        $this->log('Obtaining edit information for ' . $article);

        // Fetch edit token
        $response = $this->callAPI(array(
            'action' => 'query',
            'prop' => 'info|revisions',
            'intoken' => 'edit',
            'titles' => $article,
        ));

        if (!isset($response['query']['pages']) || !is_array($response['query']['pages'])) {
            throw new Exception('No page information returned for ' . $article);
        }

        $page = array_pop($response['query']['pages']);

        if (!is_array($page) || !isset($page['edittoken']) || !isset($page['starttimestamp'])) {
            throw new Exception('No edit token returned for ' . $article);
        }

        // A missing page has no revisions, so the query start time is used as base timestamp.
        if (array_key_exists('missing', $page) || !isset($page['revisions'][0]['timestamp'])) {
            $timestamp = $page['starttimestamp'];
        } else {
            $timestamp = $page['revisions'][0]['timestamp'];
        }

        $call = array(
            'title' => $article,
            'action' => 'edit',
            'basetimestamp' => $timestamp,
            'starttimestamp' => $page['starttimestamp'],
            'token' => $page['edittoken'],
            'summary' => $summary,
            'text' => $text
        );

        if ($section !== false) {
            $call['section'] = $section;
        }

        if ($bot) {
            $call['bot'] = 'yes';
        }

        if ($minor) {
            $call['minor'] = 'yes';
        } else {
            $call['notminor'] = 'yes';
        }

        $this->log('Editing ' . $article);

        $response = $this->callAPI($call, true);

        // An edit can be refused without an API-level error; surface that
        // the same way login() surfaces a non-successful result.
        if (isset($response['edit']['result']) && $response['edit']['result'] != 'Success') {
            throw new Exception('Edit of ' . $article . ' failed: ' . $response['edit']['result']);
        }
    }

    /**
     * Call the MediaWiki API to interact with the wiki.
     *
     * cURL errors, non-200 HTTP codes, undecodable responses and "maxlag" errors
     * are retried with exponential backoff (see setBackoff()).  Any other API
     * error is thrown immediately.  After a successful call the method sleeps
     * for the configured GET or POST delay.
     *
     * @param array An associative array of key => value pairs that are arguments to the API.  If maxlag is provided, it overrides the global maxlag setting.
     * @param boolean If true, use a POST instead of a GET request.
     * @return array The decoded API response.
     * @throws Exception on API error, or when the maximum number of attempts is exceeded.
     */
    public function callAPI($data, $post = false) {
        $data['format'] = 'php';

        if (!array_key_exists('maxlag', $data)) {
            $data['maxlag'] = $this->maxLag;
        }

        // The query string is the same for every attempt, so build it once.
        $pairs = array();
        foreach ($data as $key => $value) {
            $pairs[] = urlencode((string) $key) . '=' . urlencode((string) $value);
        }
        $query = implode('&', $pairs);

        $backoff = $this->backoffStart;
        $attempts = 1;

        do {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookie);
            curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookie);
            curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);

            if ($post) {
                curl_setopt($ch, CURLOPT_URL, $this->url);
                curl_setopt($ch, CURLOPT_POST, 1);
                curl_setopt($ch, CURLOPT_POSTFIELDS, $query);
                curl_setopt($ch, CURLOPT_HTTPHEADER, array('Content-Type: application/x-www-form-urlencoded;charset=UTF-8'));
                curl_setopt($ch, CURLOPT_HEADER, false);
            } else {
                curl_setopt($ch, CURLOPT_URL, $this->url . '?' . $query);
            }

            $raw = curl_exec($ch);
            $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            $curlError = curl_errno($ch) ? curl_error($ch) : false;

            // Everything needed has been read; close the handle before any
            // exception can be thrown so that it is never leaked.
            curl_close($ch);

            $defer = false;
            $response = false;

            if ($curlError !== false) {
                $defer = 'a cURL error: ' . $curlError;
            } else if ($code != '200') {
                $defer = 'HTTP response code ' . $code;
            } else {
                // NOTE(review): unserialize() on data received from the network can
                // instantiate objects; consider the JSON output format instead.
                $response = is_string($raw) ? unserialize($raw) : false;

                if (!is_array($response)) {
                    $defer = 'an unparseable API response';
                } else if (isset($response['error'])) {
                    if (isset($response['error']['code']) && $response['error']['code'] == 'maxlag') {
                        $defer = 'slave lag';
                    } else {
                        // Other API errors might be non-recoverable errors.
                        throw new Exception(isset($response['error']['info']) ? $response['error']['info'] : 'Unknown API error');
                    }
                }
            }

            if ($defer) {
                if (!$this->backoffAttempts || $attempts < $this->backoffAttempts) {
                    $this->log('Retry #' . $attempts . ': deferring ' . $backoff . ' seconds due to ' . $defer);
                    sleep($backoff);
                    $backoff = min($this->backoffFactor * $backoff, $this->backoffLimit);
                    $attempts++;
                } else {
                    // Give up.
                    throw new Exception('Max attempts exceeded; last defer due to ' . $defer);
                }
            }
        } while ($defer);

        // Sleep delay depends on the call type.
        if ($post) {
            sleep($this->postDelay);
        } else {
            sleep($this->getDelay);
        }

        return $response;
    }
}

?>