Wikipedia コーパス
Wikipedia はそのダンプデータを公開しているので、自然言語処理の研究で利用するひとが結構いたりする。
なので、ためしに英語版 Wikipedia のダンプデータのXMLをパースしながら、適当に書式情報を削除して1項目1ファイルにして HyperEstraier の文書ドラフト形式で出力してみた。
日本語版で、抽出に 13 分、インデキシングに 75 分。英語版で、抽出に 60 分、インデキシングに 3 時間というところ。(CPU: Intel Xeon 2.66GHz)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Convert a MediaWiki XML dump into HyperEstraier document-draft files.

Streams <page> elements out of the dump with ElementTree.iterparse,
strips most wiki markup from the article text, and writes one ``.est``
draft file per article, FILES_PER_DIR files per numbered subdirectory.

Usage: wikipedia_to_estraier.py -d DEST [dumpfile]  (reads stdin if no file)
"""
import sys
import os
import codecs
import re
from xml.etree import ElementTree
from xml.sax.saxutils import unescape

try:                              # Python 2
    from urllib import quote
except ImportError:               # Python 3
    from urllib.parse import quote

# XML namespace used by MediaWiki export dumps (schema version 0.3).
NAMESPACE = 'http://www.mediawiki.org/xml/export-0.3/'


def translate_path(s):
    """Qualify every tag of a '/'-separated path with the MediaWiki
    XML namespace, as required by ElementTree find/findtext."""
    return '/'.join('{%s}%s' % (NAMESPACE, tag) for tag in s.split('/'))


# Number of .est files written into each output subdirectory.
FILES_PER_DIR = 1000

encoder = codecs.getencoder('utf-8')

# Cache of compiled regexes, keyed by (pattern, flags).
PATTERNS = {}


def _compiled(pat, flag):
    """Return the compiled regex for (pat, flag), caching the result."""
    key = (pat, flag)
    if key not in PATTERNS:
        PATTERNS[key] = re.compile(pat, flag)
    return PATTERNS[key]


def sub(pat, repl, string, flag=0):
    """re.sub through the module-level pattern cache."""
    return _compiled(pat, flag).sub(repl, string)


def match(pat, string, flag=0):
    """re.match through the module-level pattern cache."""
    return _compiled(pat, flag).match(string)


def sub_iter(pat, repl, s, flag=0):
    """Apply ``sub`` repeatedly until the string stops changing.

    Needed for nested constructs ([[..[[..]]..]], {{..{{..}}..}}):
    each pass unwinds one level from the inside out.
    """
    while True:
        t = sub(pat, repl, s, flag)
        if t == s:
            return t
        s = t


def format_link(m):
    """Render a [[target|label]] wiki-link match as its label, or as
    the bare target when no '|label' part is present."""
    body = m.group(1)
    if '|' in body:
        return body[body.rindex('|') + 1:]
    return body


def get_namespace(title):
    """Return the namespace prefix of a title ('Talk:Foo' -> 'Talk'),
    or None when the title contains no colon."""
    parts = title.split(':', 2)
    if len(parts) > 1:
        return parts[0]
    return None


class Page(object):
    """One wiki article plus the metadata needed for an estraier draft."""

    __slots__ = 'title text timestamp author base_uri'.split()

    def __init__(self, title, timestamp, author, text, base_uri):
        self.title = title
        self.timestamp = timestamp
        self.author = author
        self.text = text
        self.base_uri = base_uri

    def toestdraft(self):
        """Serialize as a HyperEstraier document draft: attribute
        header lines, a blank separator, then the de-markuped body."""
        template = ('@uri=%(uri)s\n'
                    '@title=%(title)s\n'
                    '@author=%(author)s\n'
                    '@mdate=%(timestamp)s\n'
                    '\n'
                    '%(text)s')
        dct = {
            'title': self.title,
            # URL-escape the UTF-8 encoding of the title.
            'uri': self.base_uri + quote(encoder(self.title)[0]),
            'author': self.author,
            'text': self.format_text(self.text),
            'timestamp': self.timestamp,
        }
        return template % dct

    def format_text(self, text):
        """Strip wiki markup from *text*, returning roughly plain text."""
        s = text
        # HTML comments (re.S lets them span lines; re.X made the
        # literal newline in the original pattern insignificant anyway).
        s = sub(r'<!-- .*? -->', '', s, flag=re.X | re.S)
        # Inter-language links, e.g. [[de:Titel]].
        s = sub(r'\[\[ [a-zA-Z][a-zA-Z-]+ : [^\]]+ \]\]', '', s, flag=re.X)
        # {{lang|xx|text}} -> text.
        s = sub(r'{{ lang [|] [^|]+ [|] ([^|]+) }}', r'\1', s, flag=re.X)
        s = sub(r'{{otheruses [|] [^}]+ }}', '', s, flag=re.X | re.I)
        s = sub(r'\[\[ Category: [^\]]+ \]\]', '', s, flag=re.X)
        # s = sub(r'{ [^|]* [|] [^|]* }', '', s, flag = re.X)
        # HTML tags contained in a single line.
        s = sub(r'<.*?>', '', s)
        # == Heading == -> Heading
        s = sub(r'^=+ \s* ([^=]+) \s* =+ \s* $', r'\1\n', s, flag=re.M | re.X)
        # List bullets / numbering markers.
        s = sub(r'[ ]* [*#]+ [ ]*', '', s, flag=re.X)
        # [[link]] / [[link|label]]; fixpoint iteration unwinds nesting.
        s = sub_iter(r'\[\[ ([^\[\]]+) \]\]', format_link, s, flag=re.X)
        # Bold/italic quote markup, longest run first.
        s = sub(r"'{5}", '', s)
        s = sub(r"'{3}", '', s)
        s = sub(r"'{2}", '', s)
        # External links: [http://... label] -> label.
        s = sub(r'\[ https?://\S+ [ ] (.*?) \]', r'\1', s, flag=re.X)
        # Remaining templates, innermost first.
        s = sub_iter(r'{{ [^|]+ (?: [|] [^{}]* )? }}', '', s, flag=re.X | re.S)
        # Tables: {| ... |}
        s = sub(r'{ [|] .*? [|] }', '', s, flag=re.X | re.S)
        s = s.replace('\n\n', '\n')
        # Leftover tags, including ones broken across lines.
        # BUGFIX: was r'</?[^>+]/?>' — the stray '+' inside the character
        # class made it match only single-character tag names.
        s = sub(r'</?[^>]+/?>', '', s)
        s = unescape(s)
        return s


def pages(file_or_stream):
    """Yield a Page for every real article in a MediaWiki dump.

    Pages in non-article namespaces, pages without revision text, and
    #REDIRECT stubs are skipped.  The parsed tree is cleared as we go
    so memory stays bounded on multi-gigabyte dumps.
    """
    context = iter(ElementTree.iterparse(file_or_stream,
                                         events=('start', 'end')))
    event, root = next(context)       # grab the root so we can clear it

    page_tag = translate_path('page')
    title_tag = translate_path('title')
    text_pattern = translate_path('revision/text')
    timestamp_pattern = translate_path('revision/timestamp')
    author_pattern = translate_path('revision/contributor/username')
    siteinfo_tag = translate_path('siteinfo')

    # First: read <siteinfo> for the namespace names and the base URI.
    namespaces = set()
    for event, element in context:
        if event == 'end' and element.tag == siteinfo_tag:
            for el in element.findall(translate_path('namespaces/namespace')):
                if int(el.get('key')) != 0:   # key 0 == main article space
                    namespaces.add(el.text)
            base_uri = element.findtext(translate_path('base'))
            base_uri = base_uri[:base_uri.rindex('/') + 1]
            break
    root.clear()

    namespace_pattern = re.compile(
        '(%s):.*' % '|'.join(re.escape(s) for s in namespaces))

    for event, element in context:
        if event == 'end' and element.tag == page_tag:
            title = element.findtext(title_tag)
            text = element.findtext(text_pattern)
            timestamp = element.findtext(timestamp_pattern)
            author = element.findtext(author_pattern)
            root.clear()
            if namespace_pattern.match(title):
                continue              # talk/user/... namespace page
            # BUGFIX: findtext returns None when <revision>/<text> is
            # missing; the original crashed on .startswith then.
            if text is None or text.startswith('#REDIRECT'):
                continue
            yield Page(title=title, timestamp=timestamp, author=author,
                       text=text, base_uri=base_uri)


def main(dest, args):
    """Read a dump (file-path argument, or stdin when none is given)
    and write draft files under *dest*, FILES_PER_DIR per directory."""
    if len(args) == 1:
        reader = args[0]              # iterparse accepts a filename
    else:
        reader = sys.stdin
    target_dir = None
    for i, page in enumerate(pages(reader)):
        if i % FILES_PER_DIR == 0:
            # '//' keeps py2's integer-division behavior under py3 too.
            dir_name = '%05d' % (i // FILES_PER_DIR)
            target_dir = os.path.join(dest, dir_name)
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
        fname = '%08d.est' % i
        fp = codecs.open(os.path.join(target_dir, fname), 'w', 'utf-8')
        try:
            # Equivalent of the original ``print >>fp`` statement.
            fp.write(page.toestdraft().strip() + '\n')
        finally:
            # BUGFIX: close even when formatting/writing raises.
            fp.close()


if __name__ == '__main__':
    import optparse
    parser = optparse.OptionParser()
    parser.add_option('-d', '--dest', dest='dest')
    opts, args = parser.parse_args()
    if opts.dest is None:
        parser.print_usage()
        sys.exit(2)
    main(dest=opts.dest, args=args)