Logo Search packages:      
Sourcecode: kanjisaver version File versions  Download package

kana2romaji.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# convert a file full of kana to romaji
# Copyright 2005 Ryan Schultz <schultz.ryan@gmail.com>
# GNU GPL v2 

import sys
import os
import re
import string
import codecs

# if it helps, imagine Art Metrano singing and doing magic tricks
# while reading this, because it's scary

# Sokuon marks (small tsu): the first is hiragana, the second katakana.
SOKUON = (u'っ', u'ッ')
# A sokuon directly followed by one of these (or by the end of the text) has
# no consonant to double, so the character *before* it is repeated instead.
SOKUON_TERMINATORS = (u'.', u',', u'\n')


def parse_kana_table(lines):
    """Parse the lines of a kana_list file into substitution rows.

    Each line is split on a two-space separator and every field is stripped
    of surrounding whitespace.  Field 0 of a row is the replacement text
    (romaji); the following fields are the strings it replaces (presumably
    the hiragana and katakana spellings -- confirm against kana_list).

    Lines containing only whitespace are skipped; they used to produce a
    one-field row that crashed the substitution step with an IndexError.

    lines -- iterable of text lines
    Returns a list of rows, each a list of stripped fields, in file order.
    """
    rows = []
    for raw in lines:
        if not raw.strip():
            continue
        rows.append([field.strip() for field in raw.split('  ')])
    return rows


def substitute_kana(text, table):
    """Return text with every kana listed in table replaced by its romaji.

    Rows are applied last-to-first so that composite kana, which sit later
    in the table, are replaced before the single kana they contain.

    Only fields 1 and 2 of a row are used as search strings.  They are
    matched literally: the table is data, not regular expressions, so a
    metacharacter in the table cannot mis-match and a backslash in the
    romaji cannot be taken for an escape.  Empty or missing fields are
    ignored instead of matching at every position.

    text  -- document to convert
    table -- rows as returned by parse_kana_table()
    """
    for row in reversed(table):
        romaji = row[0]
        for kana in row[1:3]:
            if kana:
                text = text.replace(kana, romaji)
    return text


def expand_sokuon(text):
    """Return text with each sokuon replaced by a doubled neighbour.

    A sokuon normally becomes a copy of the character that follows it
    ("gaっkou" -> "gakkou").  When it is followed by '.', ',' or a newline,
    or is the last character of the text, it becomes a copy of the
    preceding character instead.

    The text is rebuilt in a single left-to-right pass, so that:
      * a run of adjacent sokuon doubles the first non-sokuon character
        after the run (the find-and-replace loop it replaces never
        terminated on such input);
      * a sokuon at the very end of the text no longer raises IndexError;
      * a sokuon with nothing usable on either side is left unchanged
        rather than wrapping around to the last character of the text;
      * the copied character is never interpreted as a regex template.
    """
    out = []
    length = len(text)
    for index, char in enumerate(text):
        if char not in SOKUON:
            out.append(char)
            continue

        # Skip over any further sokuon to find the character to double.
        ahead = index + 1
        while ahead < length and text[ahead] in SOKUON:
            ahead += 1

        if ahead < length and text[ahead] not in SOKUON_TERMINATORS:
            out.append(text[ahead])
        elif out:
            # Nothing to double forward: repeat what was just emitted.
            out.append(out[-1])
        else:
            # Start of text with nothing usable after it: keep the mark.
            out.append(char)
    return u''.join(out)


def main(argv):
    """Convert the kana in the file named by argv[1] to romaji.

    Reads the substitution table from 'kana_list' in the current directory
    and writes the converted text next to the input, with 'RMJI' appended
    to the file name.  All files are read and written as UTF-8.

    Returns the process exit status: 0 on success, 2 when no input file
    was given on the command line.
    """
    if len(argv) < 2:
        sys.stderr.write('usage: %s FILE\n' % argv[0])
        return 2
    source_path = argv[1]
    target_path = source_path + 'RMJI'

    print('loading kana_list file')
    with codecs.open('kana_list', 'r', 'utf-8') as table_file:
        kanalist = parse_kana_table(table_file.readlines())

    print('opening document')
    with codecs.open(source_path, 'r', 'utf-8') as source:
        doc = source.read()

    print('subbing kana with romaji')
    doc = substitute_kana(doc, kanalist)
    # Sokuon are handled after substitution so the character being doubled
    # is already romaji.
    doc = expand_sokuon(doc)

    print('writing converted file to %s' % target_path)
    with codecs.open(target_path, 'w', 'utf-8') as target:
        target.write(doc)
    print('done!')
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))

Generated by  Doxygen 1.6.0   Back to index