253 lines
9.2 KiB
Python
253 lines
9.2 KiB
Python
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
A class that allows for easy transliteration of [Hirag/Katak]ana
and English/Latin. Much of the work here is inspired/inherited/etc
from Kim Ahlström and his work on "Ve", built in Ruby.

Credit where credit is due:
https://github.com/Kimtaro/ve/blob/master/lib/providers/japanese_transliterators.rb
"""

# The docstring above has to be the first statement in the module to be
# picked up as the module's __doc__, so the metadata follows it.
__author__ = "Ryan McGrath <ryan@venodesigns.net>"
__version__ = "1.0.1"
|
|
|
|
import re
|
|
|
|
# Lookup tables for character conversions. Much of this is borrowed from the work of
|
|
# Kim Ahlström and Ve: https://github.com/Kimtaro/ve/
|
|
#
|
|
# Ve's Transliterators are written in Ruby, and I wanted Python. Consider it a nice port. ;)
|
|
from translation_maps import H_SYLLABIC_N, H_SMALL_TSU, HIRA_TO_LATN, LATN_TO_HIRA
|
|
|
|
def defaultToSelfText(fn):
    """
    A fun little decorator that makes it so we can default to
    the text stored on a class instance, but also let people just
    instantiate and re-use calls while supplying new text. Whee.

    Parameters:
        fn - Method taking (self, text). It is always invoked with
             text passed as a keyword argument.

    Returns:
        A method taking (self, text = None). When text is None the
        instance's own self.text is used instead; any other value,
        including an empty string, is passed through untouched.
    """
    # Imported here so the decorator carries its own dependency.
    from functools import wraps

    # wraps() keeps the decorated method's name and docstring, so help()
    # and tracebacks show the real method rather than "wrapper".
    @wraps(fn)
    def wrapper(self, text=None):
        if text is None:
            text = self.text

        return fn(self, text=text)

    return wrapper
|
|
|
|
class JapaneseTransliterator(object):
    """
    Converts text between Hiragana, Katakana, Latin script (romaji) and
    full-width/half-width character forms.

    The text given to the constructor is stored on the instance. Every
    transliterate_* method works on that stored text unless different
    text is supplied for that particular call. Byte strings are decoded
    as UTF-8 and all results are unicode text.
    """

    # Patterns used while converting romaji. They are plain Python regular
    # expressions (no surrounding slashes) written as raw strings so that
    # \1 is a backreference rather than the control character chr(1).
    _M_BEFORE_B_OR_P = re.compile(r'm([BbPp])')
    _CAPITAL_M_BEFORE_B_OR_P = re.compile(r'M([BbPp])')
    _ANY_UPPERCASE = re.compile(r'[A-Z]')
    _NN_BEFORE_VOWEL = re.compile(r'nn[aiueo]')
    _DOUBLED_CONSONANT = re.compile(r'([kgsztdnbpmyrlwc])\1')

    def __init__(self, text):
        """__init__(self, text)

            JapaneseTransliterator("fadjfnjsfnjsafnjsdnf")

            I envisioned storing the original text on the instantiated object
            itself, and allowing it to be overridden on a per-function-call basis.

            So I did.

            Parameters:
                text - Text to be operated on. Unicode please!
        """
        self.text = text

    @staticmethod
    def _to_unicode(value):
        """JapaneseTransliterator._to_unicode(value)

            Decodes UTF-8 byte strings and returns anything else unchanged.

            Text that is already unicode must not be decoded again: on
            Python 2 that triggers an implicit ASCII encode, which fails
            for any kana.

            Parameters:
                value - byte string or unicode text

            Returns:
                unicode text
        """
        # On Python 2 "bytes" is an alias of "str", so this check picks
        # out exactly the values that still need decoding.
        if isinstance(value, bytes):
            return value.decode('utf-8')

        return value

    @staticmethod
    def _unicode_table(table):
        """JapaneseTransliterator._unicode_table(table)

            Returns a copy of a lookup table with every key and value
            passed through _to_unicode, so lookups with unicode text work
            whether the table was written with byte strings or unicode.

            Parameters:
                table - one of the translation_maps lookup tables
                        (assumed to be a dict of string to string; that
                        module is not visible from this file)

            Returns:
                dict of unicode to unicode
        """
        to_unicode = JapaneseTransliterator._to_unicode
        return dict((to_unicode(key), to_unicode(value)) for key, value in table.items())

    @defaultToSelfText
    def transliterate_from_hrkt_to_latn(self, text):
        """transliterate_from_hrkt_to_latn(self, text)

            Transliterates from [Hirag/Katak]ana to Latin/En. Katakana is
            first folded to Hiragana, then the Hiragana is romanised.

            Parameters:
                text - Optional. Use different text than what's on
                       the class instance.

            Returns:
                unicode text
        """
        hiragana = self.transliterate_from_kana_to_hira(text)
        return self.transliterate_from_hira_to_latn(hiragana)

    @defaultToSelfText
    def transliterate_from_hira_to_latn(self, text):
        """transliterate_from_hira_to_latn(self, text)

            Transliterates from Hiragana to Latin/En. Phonetics, that is.

            The text is scanned left to right, trying a two character
            chunk before a one character chunk at each position.
            Characters with no entry in HIRA_TO_LATN are copied through
            unchanged.

            Parameters:
                text - Optional. Use different text than what's on
                       the class instance.

            Returns:
                unicode text
        """
        to_unicode = JapaneseTransliterator._to_unicode
        small_tsu = to_unicode(H_SMALL_TSU)
        syllabic_n = to_unicode(H_SYLLABIC_N)
        hira_to_latn = JapaneseTransliterator._unicode_table(HIRA_TO_LATN)

        kana = to_unicode(text)
        total = len(kana)
        pieces = []
        geminate = False
        index = 0

        while index < total:
            for length in (2, 1):
                chunk = kana[index:index + length]
                following = kana[index + 1:index + 2]
                mora = u''

                if chunk == small_tsu:
                    # A small tsu produces no output of its own; it doubles
                    # the first letter of whichever mora comes next.
                    geminate = True
                    index += len(chunk)
                    break
                elif chunk == syllabic_n and u'\u3084' <= following <= u'\u3088':
                    # Syllabic N before ya, yu or yo
                    mora = u"n'"
                elif chunk in hira_to_latn:
                    mora = hira_to_latn[chunk]

                if mora:
                    if geminate:
                        geminate = False
                        # Double the mora's own first letter. Slicing by the
                        # position in the input text picks the wrong letter
                        # (or nothing at all).
                        pieces.append(mora[:1])

                    pieces.append(mora)
                    index += len(chunk)
                    break
                elif length == 1:
                    # Nothing matched, even for a single character: keep it.
                    pieces.append(chunk)
                    index += 1

        return u''.join(pieces)

    @defaultToSelfText
    def transliterate_from_latn_to_hrkt(self, text):
        """transliterate_from_latn_to_hrkt(self, text)

            Transliterates from Latin/En to Hiragana (mostly). A chunk
            containing an upper case letter comes out as Katakana.

            The text is scanned left to right, trying three, then two,
            then one character at each position. Characters that match
            nothing are copied through lower-cased.

            Parameters:
                text - Optional. Use different text than what's on
                       the class instance.

            Returns:
                unicode text
        """
        cls = JapaneseTransliterator
        small_tsu = cls._to_unicode(H_SMALL_TSU)
        syllabic_n = cls._to_unicode(H_SYLLABIC_N)
        latn_to_hira = cls._unicode_table(LATN_TO_HIRA)

        romaji = cls._to_unicode(text)

        # "m" in front of b/p is rewritten to "n" before the table lookups.
        romaji = cls._M_BEFORE_B_OR_P.sub(r'n\1', romaji)
        romaji = cls._CAPITAL_M_BEFORE_B_OR_P.sub(r'N\1', romaji)

        total = len(romaji)
        pieces = []
        index = 0

        # Walk the whole string; stopping one character early loses the
        # final character and returns nothing for one-character input.
        while index < total:
            for length in (3, 2, 1):
                chunk = romaji[index:index + length]
                is_upper = cls._ANY_UPPERCASE.search(chunk) is not None
                lowered = chunk.lower()
                consumed = len(chunk)
                mora = u''

                if cls._NN_BEFORE_VOWEL.match(lowered):
                    # "nna" and friends: emit a syllabic N for the first n
                    # only and let the rest be picked up on the next pass.
                    mora = syllabic_n
                    consumed = 1
                elif lowered in latn_to_hira:
                    mora = latn_to_hira[lowered]
                elif lowered == u'tch' or (length == 2 and cls._DOUBLED_CONSONANT.match(lowered)):
                    # Doubled consonant: small tsu for the first letter.
                    mora = small_tsu
                    consumed = 1

                if mora:
                    if is_upper:
                        mora = self.transliterate_from_hira_to_kana(text=mora)

                    pieces.append(mora)
                    index += consumed
                    break
                elif length == 1:
                    pieces.append(lowered)
                    index += 1

        return u''.join(pieces)

    @defaultToSelfText
    def transliterate_from_kana_to_hira(self, text):
        """transliterate_from_kana_to_hira(self, text)

            Transliterates from Katakana to Hiragana by moving codepoints
            12449-12534 down by 96.

            Parameters:
                text - Optional. Use different text than what's on
                       the class instance.

            Returns:
                unicode text
        """
        return JapaneseTransliterator.transpose_codepoints_in_range(text, -96, 12449, 12534)

    @defaultToSelfText
    def transliterate_from_hira_to_kana(self, text):
        """transliterate_from_hira_to_kana(self, text)

            Transliterates from Hiragana to Katakana by moving codepoints
            12353-12438 up by 96.

            Parameters:
                text - Optional. Use different text than what's on
                       the class instance.

            Returns:
                unicode text
        """
        return JapaneseTransliterator.transpose_codepoints_in_range(text, 96, 12353, 12438)

    @defaultToSelfText
    def transliterate_from_fullwidth_to_halfwidth(self, text):
        """transliterate_from_fullwidth_to_halfwidth(self, text)

            Transliterates from full-width to half-width.

            Parameters:
                text - Optional. Use different text than what's on
                       the class instance.

            Returns:
                unicode text
        """
        transpose = JapaneseTransliterator.transpose_codepoints_in_range

        # Codepoints 65281-65374 drop to 33-126 ...
        text = transpose(text, -65248, 65281, 65374)
        # ... and codepoint 12288 becomes the plain space, 32.
        return transpose(text, -12256, 12288, 12288)

    @defaultToSelfText
    def transliterate_from_halfwidth_to_fullwidth(self, text):
        """transliterate_from_halfwidth_to_fullwidth(self, text)

            Transliterates from half-width to full-width.

            Parameters:
                text - Optional. Use different text than what's on
                       the class instance.

            Returns:
                unicode text
        """
        transpose = JapaneseTransliterator.transpose_codepoints_in_range

        # Codepoints 33-126 rise to 65281-65374 ...
        text = transpose(text, 65248, 33, 126)
        # ... and the plain space, 32, becomes codepoint 12288.
        return transpose(text, 12256, 32, 32)

    @staticmethod
    def transpose_codepoints_in_range(text, distance, range_start, range_end):
        """JapaneseTransliterator.transpose_codepoints_in_range(text, distance, range_start, range_end)

            Given a set of text (unicode...), coupled with distance and range, transposes
            it for a corresponding swap and returns the new set.

            Parameters:
                text - text to be transposed, codepoint-wise
                distance - to the other side of the map
                range_start - start of the range we're interested in, codepoint-wise
                range_end - end of the range we're interested in, codepoint-wise (inclusive)

            Returns:
                unicode text in which every character whose codepoint lies
                in the range has been moved by distance; all other
                characters are unchanged
        """
        # Byte strings are decoded as UTF-8; invalid input raises
        # UnicodeDecodeError. Text that is already unicode is left alone.
        if isinstance(text, bytes):
            text = text.decode('utf-8')

        try:
            to_char = unichr  # Python 2
        except NameError:
            to_char = chr  # Python 3, where chr() covers all of Unicode

        return u''.join(
            to_char(ord(char) + distance)
            if range_start <= ord(char) <= range_end
            else char
            for char in text
        )
|