Initial, etc

This commit is contained in:
Ryan McGrath 2012-04-11 06:30:19 -04:00
commit 4e04fbe997
6 changed files with 524 additions and 0 deletions

21
LICENSE Normal file
View file

@ -0,0 +1,21 @@
The MIT License
Copyright (c) 2009 - 2010 Ryan McGrath
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

254
jTransliterate/__init__.py Normal file
View file

@ -0,0 +1,254 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
A class that allows for easy transliteration of [Hirag/Katak]ana
and English/Latin. Much of the work here is inspired/inherited/etc
from Kim Ahlström and his work on "Ve", built in Ruby.

Credit where credit is due:
https://github.com/Kimtaro/ve/blob/master/lib/providers/japanese_transliterators.rb
"""
# The shebang has to be the very first line to have any effect, and the
# description has to be the first statement to become the module docstring,
# so both now precede the metadata assignments below.
__author__ = "Ryan McGrath <ryan@venodesigns.net>"
__version__ = "1.0"
import re
# Lookup tables for character conversions. Much of this is borrowed from the work of
# Kim Ahlström and Ve: https://github.com/Kimtaro/ve/
#
# Ve's Transliterators are written in Ruby, and I wanted Python. Consider it a nice port. ;)
from translation_maps import H_SYLLABIC_N, H_SMALL_TSU, HIRA_TO_LATN, LATN_TO_HIRA
def defaultToSelfText(fn):
    """Decorate a method so its ``text`` argument defaults to ``self.text``.

    This lets callers either rely on the text stored on the instance or
    supply different text for a single call.

    Parameters:
        fn - method taking ``(self, text)``.

    Returns:
        A wrapper accepting ``(self, text=None)``. When ``text`` is None the
        wrapper substitutes ``self.text``; any other value (including an
        empty string) is forwarded unchanged as the ``text`` keyword.
    """
    # Imported locally so this block does not depend on a file-level import.
    from functools import wraps

    @wraps(fn)  # keep the wrapped method's __name__ and __doc__ for help()
    def wrapper(self, text=None):
        if text is None:
            text = self.text
        return fn(self, text=text)

    return wrapper
try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr() already returns text for any codepoint

# U+3084-U+3088: ya, small yu, yu, small yo, yo. A syllabic "n" directly
# before one of these is written "n'" to keep it apart from nya/nyu/nyo.
_Y_ROW_KANA = re.compile(u'[\u3084-\u3088]')
# "m" before b/p is read as the syllabic "n" (e.g. "shimbun" -> "shinbun").
_M_BEFORE_LABIAL_LOWER = re.compile(r'm([BbPp])')
_M_BEFORE_LABIAL_UPPER = re.compile(r'M([BbPp])')
_UPPERCASE = re.compile('[A-Z]')
_NN_BEFORE_VOWEL = re.compile('nn[aiueo]')
_DOUBLED_CONSONANT = re.compile(r'([kgsztdnbpmyrlwc])\1')


def _to_unicode(text):
    """Return ``text`` as unicode text, decoding UTF-8 byte strings.

    Text that is already unicode is returned untouched, so the helper is
    safe to apply more than once (re-decoding unicode is what used to make
    chained conversions blow up).
    """
    if isinstance(text, bytes):
        return text.decode('utf-8')
    return text


class JapaneseTransliterator(object):
    """Transliterate between Hiragana, Katakana and Latin script.

    Every conversion method accepts an optional ``text`` argument; when it
    is omitted, the text given to the constructor is used. Byte strings are
    decoded as UTF-8 and every method returns unicode text.
    """

    def __init__(self, text):
        """Store the default text to operate on.

        Parameters:
            text - Text to be operated on. Unicode please! (UTF-8 encoded
                   byte strings are decoded when they are used.)
        """
        self.text = text

    @defaultToSelfText
    def transliterate_from_hrkt_to_latn(self, text):
        """Transliterate from [Hirag/Katak]ana to Latin/En.

        Katakana is first folded to Hiragana, then romanised.

        Parameters:
            text - Optional. Use different text than what's on
                   the class instance.
        """
        hiragana = self.transliterate_from_kana_to_hira(text)
        return self.transliterate_from_hira_to_latn(hiragana)

    @defaultToSelfText
    def transliterate_from_hira_to_latn(self, text):
        """Transliterate from Hiragana to Latin/En. Phonetics, that is.

        Characters that have no entry in HIRA_TO_LATN are passed through
        unchanged.

        Parameters:
            text - Optional. Use different text than what's on
                   the class instance.
        """
        kana = _to_unicode(text)
        small_tsu = _to_unicode(H_SMALL_TSU)
        syllabic_n = _to_unicode(H_SYLLABIC_N)

        pieces = []
        geminate = False
        index = 0
        total = len(kana)
        while index < total:
            # Try the two-kana digraphs first, then fall back to one kana.
            for length in (2, 1):
                chunk = kana[index:index + length]
                mora = ''
                if chunk == small_tsu:
                    # Small tsu doubles the consonant of the *next* mora.
                    geminate = True
                    index += len(chunk)
                    break
                if chunk == syllabic_n and _Y_ROW_KANA.match(kana[index + 1:index + 2]):
                    # Syllabic N before ya, yu or yo
                    mora = "n'"
                elif chunk in HIRA_TO_LATN:
                    mora = HIRA_TO_LATN[chunk]

                if mora:
                    if geminate:
                        geminate = False
                        # Double the first letter of this mora (not a slice
                        # at the position in the kana string).
                        pieces.append(mora[:1])
                    pieces.append(mora)
                    index += len(chunk)
                    break
                if length == 1:
                    pieces.append(chunk)
                    index += 1
        return u''.join(pieces)

    @defaultToSelfText
    def transliterate_from_latn_to_hrkt(self, text):
        """Transliterate from Latin/En to Hiragana (mostly).

        Chunks containing an uppercase letter are emitted as Katakana.
        Characters that cannot be converted are passed through unchanged.

        Parameters:
            text - Optional. Use different text than what's on
                   the class instance.
        """
        romaji = _to_unicode(text)
        romaji = _M_BEFORE_LABIAL_LOWER.sub(r'n\1', romaji)
        romaji = _M_BEFORE_LABIAL_UPPER.sub(r'N\1', romaji)
        small_tsu = _to_unicode(H_SMALL_TSU)
        syllabic_n = _to_unicode(H_SYLLABIC_N)

        pieces = []
        index = 0
        total = len(romaji)
        while index < total:
            # Longest match first: three letters, then two, then one.
            for width in (3, 2, 1):
                original = romaji[index:index + width]
                is_upper = _UPPERCASE.search(original) is not None
                candidate = original.lower()
                mora = u''
                consumed = len(original)

                if _NN_BEFORE_VOWEL.match(candidate):
                    # "nn" + vowel: emit one syllabic n and leave "n" + vowel
                    # for the next round (e.g. "konnichiwa").
                    mora = syllabic_n
                    consumed = 1
                elif candidate in LATN_TO_HIRA:
                    mora = _to_unicode(LATN_TO_HIRA[candidate])
                elif candidate == 'tch' or (width == 2 and _DOUBLED_CONSONANT.match(candidate)):
                    # A doubled consonant becomes small tsu; only the first
                    # letter is consumed so the second starts the next mora.
                    mora = small_tsu
                    consumed = 1

                if mora:
                    if is_upper:
                        mora = self.transliterate_from_hira_to_kana(text=mora)
                    pieces.append(mora)
                    index += consumed
                    break
                if width == 1:
                    # Nothing matched: keep the character exactly as given.
                    pieces.append(original)
                    index += 1
        return u''.join(pieces)

    @defaultToSelfText
    def transliterate_from_kana_to_hira(self, text):
        """Transliterate from Katakana to Hiragana.

        Parameters:
            text - Optional. Use different text than what's on
                   the class instance.
        """
        # Codepoints 12449-12534 (U+30A1-U+30F6) shifted down by 96.
        return JapaneseTransliterator.transpose_codepoints_in_range(text, -96, 12449, 12534)

    @defaultToSelfText
    def transliterate_from_hira_to_kana(self, text):
        """Transliterate from Hiragana to Katakana.

        Parameters:
            text - Optional. Use different text than what's on
                   the class instance.
        """
        # Codepoints 12353-12438 (U+3041-U+3096) shifted up by 96.
        return JapaneseTransliterator.transpose_codepoints_in_range(text, 96, 12353, 12438)

    @defaultToSelfText
    def transliterate_from_fullwidth_to_halfwidth(self, text):
        """Transliterate from full-width to half-width.

        Parameters:
            text - Optional. Use different text than what's on
                   the class instance.
        """
        # Codepoints 65281-65374 map onto 33-126; 12288 maps onto 32.
        text = JapaneseTransliterator.transpose_codepoints_in_range(text, -65248, 65281, 65374)
        return JapaneseTransliterator.transpose_codepoints_in_range(text, -12256, 12288, 12288)

    @defaultToSelfText
    def transliterate_from_halfwidth_to_fullwidth(self, text):
        """Transliterate from half-width to full-width.

        Parameters:
            text - Optional. Use different text than what's on
                   the class instance.
        """
        # Codepoints 33-126 map onto 65281-65374; 32 maps onto 12288.
        text = JapaneseTransliterator.transpose_codepoints_in_range(text, 65248, 33, 126)
        return JapaneseTransliterator.transpose_codepoints_in_range(text, 12256, 32, 32)

    @staticmethod
    def transpose_codepoints_in_range(text, distance, range_start, range_end):
        """Shift every codepoint inside a range by a fixed distance.

        Parameters:
            text - text to be transposed, codepoint-wise (UTF-8 byte
                   strings are decoded first)
            distance - offset added to each codepoint inside the range
            range_start - first codepoint of the range, inclusive
            range_end - last codepoint of the range, inclusive

        Returns:
            Unicode text with in-range characters shifted and all other
            characters left untouched.
        """
        text = _to_unicode(text)
        return u''.join(
            _unichr(ord(char) + distance)
            if range_start <= ord(char) <= range_end
            else char
            for char in text
        )

30
jTransliterate/test.py Normal file
View file

@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
from __init__ import JapaneseTransliterator

# Latin/English -> [Hirag/Katak]ana.
latin_demo = JapaneseTransliterator('kanazawa')
print(latin_demo.transliterate_from_latn_to_hrkt())
# Expected output: "かなざわ"

# Hiragana -> Latin/English, using the text stored on the instance.
kana_demo = JapaneseTransliterator('かなざわ')
print(kana_demo.transliterate_from_hira_to_latn())
# Expected output: "kanazawa"

# Hiragana or Katakana -> Latin/English, overriding the stored text.
print(kana_demo.transliterate_from_hrkt_to_latn(text = 'カナザワ'))
# Expected output: "kanazawa"

# Katakana -> Hiragana (you... probably never need to do this).
print(kana_demo.transliterate_from_kana_to_hira(text = 'キットカート'))
# Expected output: "きっとかーと"

# Hiragana -> Katakana.
print(kana_demo.transliterate_from_hira_to_kana(text = 'かなざわ'))
# Expected output: "カナザワ"

# Half-width <-> full-width conversion is available through the two calls
# below; they run against the stored text and their results are not shown.
kana_demo.transliterate_from_halfwidth_to_fullwidth()
kana_demo.transliterate_from_fullwidth_to_halfwidth()

View file

@ -0,0 +1,128 @@
# -*- coding: utf-8 -*-
# Hiragana syllabic "n" (U+3093) and small "tsu" (U+3063). Both literals had
# been reduced to empty strings, which disabled the syllabic-n and geminate
# handling in the transliterator.
H_SYLLABIC_N = 'ん'
H_SMALL_TSU = 'っ'
"""
Python sucks with regards to unicode-fun, but I'm leaving this here as
a fun reference for anyone deciphering all this. Enjoy. -- Ryan
HIRA_TO_LATN = {
"あ":"a", "い":"i", "う":"u", "え":"e", "お":"o",
"か":"ka", "き":"ki", "く":"ku", "け":"ke", "こ":"ko",
"が":"ga", "ぎ":"gi", "ぐ":"gu", "げ":"ge", "ご":"go",
"さ":"sa", "し":"shi", "す":"su", "せ":"se", "そ":"so",
"ざ":"za", "じ":"ji", "ず":"zu", "ぜ":"ze", "ぞ":"zo",
"た":"ta", "ち":"chi", "つ":"tsu", "て":"te", "と":"to",
"だ":"da", "ぢ":"ji", "づ":"zu", "で":"de", "ど":"do",
"な":"na", "に":"ni", "ぬ":"nu", "ね":"ne", "の":"no",
"は":"ha", "ひ":"hi", "ふ":"fu", "へ":"he", "ほ":"ho",
"ば":"ba", "び":"bi", "ぶ":"bu", "べ":"be", "ぼ":"bo",
"ぱ":"pa", "ぴ":"pi", "ぷ":"pu", "ぺ":"pe", "ぽ":"po",
"ま":"ma", "み":"mi", "む":"mu", "め":"me", "も":"mo",
"や":"ya", "ゆ":"yu", "よ":"yo",
"ら":"ra", "り":"ri", "る":"ru", "れ":"re", "ろ":"ro",
"わ":"wa", "うぃ":"whi", "うぇ":"whe", "を":"wo",
"ゑ":"wye", "ゐ":"wyi", "ー":"-", "ん":"n",
"きゃ":"kya", "きゅ":"kyu", "きょ":"kyo", "きぇ":"kye", "きぃ":"kyi",
"ぎゃ":"gya", "ぎゅ":"gyu", "ぎょ":"gyo", "ぎぇ":"gye", "ぎぃ":"gyi",
"くぁ":"kwa", "くぃ":"kwi", "くぅ":"kwu", "くぇ":"kwe", "くぉ":"kwo",
"ぐぁ":"qwa", "ぐぃ":"gwi", "ぐぅ":"gwu", "ぐぇ":"gwe", "ぐぉ":"gwo",
"しゃ":"sha", "しぃ":"syi", "しゅ":"shu", "しぇ":"she", "しょ":"sho",
"じゃ":"jya", "じゅ":"zyu", "じぇ":"zye", "じょ":"zyo", "じぃ":"zyi",
"すぁ":"swa", "すぃ":"swi", "すぅ":"swu", "すぇ":"swe", "すぉ":"swo",
"ちゃ":"tya", "ちゅ":"tyu", "ちぇ":"tye", "ちょ":"tyo", "ちぃ":"tyi",
"ぢゃ":"dya", "ぢぃ":"dyi", "ぢゅ":"dyu", "ぢぇ":"dye", "ぢょ":"dyo",
"つぁ":"tsa", "つぃ":"tsi", "つぇ":"tse", "つぉ":"tso", "てゃ":"tha",
"てぃ":"thi", "てゅ":"thu", "てぇ":"the", "てょ":"tho", "とぁ":"twa",
"とぃ":"twi", "とぅ":"twu", "とぇ":"twe", "とぉ":"two", "でゃ":"dha",
"でぃ":"dhi", "でゅ":"dhu", "でぇ":"dhe", "でょ":"dho", "どぁ":"dwa",
"どぃ":"dwi", "どぅ":"dwu", "どぇ":"dwe", "どぉ":"dwo", "にゃ":"nya",
"にゅ":"nyu", "にょ":"nyo", "にぇ":"nye", "にぃ":"nyi", "ひゃ":"hya",
"ひぃ":"hyi", "ひゅ":"hyu", "ひぇ":"hye", "ひょ":"hyo", "びゃ":"bya",
"びぃ":"byi", "びゅ":"byu", "びぇ":"bye", "びょ":"byo", "ぴゃ":"pya",
"ぴぃ":"pyi", "ぴゅ":"pyu", "ぴぇ":"pye", "ぴょ":"pyo", "ふぁ":"fwa",
"ふぃ":"fyi", "ふぇ":"fye", "ふぉ":"fwo", "ふぅ":"fwu", "ふゃ":"fya",
"ふゅ":"fyu", "ふょ":"fyo", "みゃ":"mya", "みぃ":"myi", "みゅ":"myu",
"みぇ":"mye", "みょ":"myo", "りゃ":"rya", "りぃ":"ryi", "りゅ":"ryu",
"りぇ":"rye", "りょ":"ryo",
"ゔぁ":"va", "ゔぃ":"vyi", "ゔ":"vu", "ゔぇ":"vye", "ゔぉ":"vo",
"ゔゃ":"vya", "ゔゅ":"vyu", "ゔょ":"vyo",
"うぁ":"wha", "いぇ":"ye", "うぉ":"who",
"ぁ":"xa", "ぃ":"xi", "ぅ":"xu", "ぇ":"xe", "ぉ":"xo",
"ゕ":"xka", "ゖ":"xke", "ゎ":"xwa"
}
"""
# Hiragana -> Latin lookup table. Keys are unicode strings of one or two
# kana written as \u escapes; values are the romaji emitted for that key.
# The table also carries U+30FC (the long-vowel mark), mapped to '-'.
# NOTE(review): u'\u3050\u3041' maps to 'qwa' while the neighbouring
# u'\u3050...' entries all use a 'gw' prefix -- confirm whether intentional.
HIRA_TO_LATN = {u'\u3057\u3047': 'she', u'\u3057\u3043': 'syi', u'\u308b': 'ru', u'\u3093': 'n', u'\u3074\u3047': 'pye', u'\u3074\u3043': 'pyi', u'\u304d\u3085': 'kyu', u'\u304d\u3087': 'kyo', u'\u304d\u3083': 'kya', u'\u3067\u3047': 'dhe', u'\u3050\u3041': 'qwa', u'\u3067\u3043': 'dhi', u'\u3094\u3087': 'vyo', u'\u308a\u3043': 'ryi', u'\u3094\u3083': 'vya', u'\u3048': 'e', u'\u3050': 'gu', u'\u3058': 'ji', u'\u3060': 'da', u'\u3064\u3049': 'tso', u'\u3064\u3047': 'tse', u'\u3064\u3043': 'tsi', u'\u3064\u3041': 'tsa', u'\u3070': 'ba', u'\u3078': 'he', u'\u3080': 'mu', u'\u3088': 'yo', u'\u3043': 'xi', u'\u3090': 'wyi', u'\u3050\u3043': 'gwi', u'\u3072': 'hi', u'\u3050\u3047': 'gwe', u'\u3050\u3045': 'gwu', u'\u3050\u3049': 'gwo', u'\u3057\u3087': 'sho', u'\u3057\u3085': 'shu', u'\u3057\u3083': 'sha', u'\u304b': 'ka', u'\u3053': 'ko', u'\u3074\u3087': 'pyo', u'\u305b': 'se', u'\u3074\u3085': 'pyu', u'\u3074\u3083': 'pya', u'\u304d\u3047': 'kye', u'\u3068\u3041': 'twa', u'\u304d\u3043': 'kyi', u'\u306b': 'ni', u'\u3067\u3087': 'dho', u'\u3067\u3085': 'dhu', u'\u3067\u3083': 'dha', u'\u3094\u3049': 'vo', u'\u3094\u3047': 'vye', u'\u307b': 'ho', u'\u3094\u3043': 'vyi', u'\u3094\u3041': 'va', u'\u3081': 'me', u'\u3089': 'ra', u'\u3091': 'wye', u'\u3046\u3041': 'wha', u'\u3046\u3043': 'whi', u'\u3046\u3047': 'whe', u'\u3073\u3083': 'bya', u'\u3046\u3049': 'who', u'\u3073\u3087': 'byo', u'\u3073\u3085': 'byu', u'\u3066\u3083': 'tha', u'\u3066\u3085': 'thu', u'\u3066\u3087': 'tho', u'\u3046': 'u', u'\u304e': 'gi', u'\u3056': 'za', u'\u308a\u3047': 'rye', u'\u305e': 'zo', u'\u3094\u3085': 'vyu', u'\u3066': 'te', u'\u306e': 'no', u'\u3076': 'bu', u'\u307e': 'ma', u'\u3059\u3049': 'swo', u'\u3086': 'yu', u'\u3059\u3041': 'swa', u'\u3059\u3043': 'swi', u'\u3059\u3045': 'swu', u'\u3059\u3047': 'swe', u'\u308e': 'xwa', u'\u3096': 'xke', u'\u308a\u3085': 'ryu', u'\u308a\u3087': 'ryo', u'\u308a\u3083': 'rya', u'\u3073': 'bi', u'\u3069\u3049': 'dwo', u'\u3069\u3041': 'dwa', 
u'\u3069\u3043': 'dwi', u'\u3069\u3045': 'dwu', u'\u3069\u3047': 'dwe', u'\u3041': 'xa', u'\u3049': 'xo', u'\u3051': 'ke', u'\u3073\u3043': 'byi', u'\u3073\u3047': 'bye', u'\u3061': 'chi', u'\u3069': 'do', u'\u3071': 'pa', u'\u3066\u3043': 'thi', u'\u3066\u3047': 'the', u'\u3079': 'be', u'\u308f': 'wa', u'\u3062\u3085': 'dyu', u'\u3062\u3087': 'dyo', u'\u3062\u3083': 'dya', u'\u307f\u3087': 'myo', u'\u307f\u3085': 'myu', u'\u307f\u3083': 'mya', u'\u3044': 'i', u'\u304c': 'ga', u'\u3072\u3085': 'hyu', u'\u3072\u3087': 'hyo', u'\u3054': 'go', u'\u3072\u3083': 'hya', u'\u305c': 'ze', u'\u3064': 'tsu', u'\u304f\u3049': 'kwo', u'\u304f\u3047': 'kwe', u'\u304f\u3045': 'kwu', u'\u304f\u3043': 'kwi', u'\u306c': 'nu', u'\u304f\u3041': 'kwa', u'\u3074': 'pi', u'\u3068': 'to', u'\u307c': 'bo', u'\u3084': 'ya', u'\u308c': 're', u'\u3072\u3047': 'hye', u'\u3094': 'vu', u'\u3072\u3043': 'hyi', u'\u3045': 'xu', u'\u3047': 'xe', u'\u304f': 'ku', u'\u3057': 'shi', u'\u305f': 'ta', u'\u3062\u3047': 'dye', u'\u3067': 'de', u'\u3062\u3043': 'dyi', u'\u306f': 'ha', u'\u3077': 'pu', u'\u307f\u3047': 'mye', u'\u307f\u3043': 'myi', u'\u30fc': '-', u'\u307f': 'mi', u'\u306b\u3083': 'nya', u'\u306b\u3087': 'nyo', u'\u306b\u3085': 'nyu', u'\u308d': 'ro', u'\u3059': 'su', u'\u3095': 'xka', u'\u304e\u3043': 'gyi', u'\u304e\u3047': 'gye', u'\u3042': 'a', u'\u3058\u3043': 'zyi', u'\u304a': 'o', u'\u3058\u3047': 'zye', u'\u3052': 'ge', u'\u3075\u3049': 'fwo', u'\u3075\u3045': 'fwu', u'\u3075\u3047': 'fye', u'\u305a': 'zu', u'\u3075\u3041': 'fwa', u'\u3075\u3043': 'fyi', u'\u3061\u3083': 'tya', u'\u3062': 'ji', u'\u3061\u3085': 'tyu', u'\u3061\u3087': 'tyo', u'\u306a': 'na', u'\u3044\u3047': 'ye', u'\u3068\u3049': 'two', u'\u3068\u3043': 'twi', u'\u307a': 'pe', u'\u3068\u3047': 'twe', u'\u3068\u3045': 'twu', u'\u3082': 'mo', u'\u3058\u3083': 'jya', u'\u308a': 'ri', u'\u3058\u3087': 'zyo', u'\u3058\u3085': 'zyu', u'\u3092': 'wo', u'\u3075\u3085': 'fyu', u'\u3075\u3087': 'fyo', u'\u3075\u3083': 
'fya', u'\u3061\u3043': 'tyi', u'\u3061\u3047': 'tye', u'\u306b\u3043': 'nyi', u'\u306b\u3047': 'nye', u'\u304d': 'ki', u'\u3055': 'sa', u'\u305d': 'so', u'\u3065': 'zu', u'\u304e\u3083': 'gya', u'\u306d': 'ne', u'\u304e\u3085': 'gyu', u'\u304e\u3087': 'gyo', u'\u3075': 'fu', u'\u307d': 'po'}
# Latin -> Hiragana lookup table, keyed by lower-case romaji of one to four
# letters (plus a few punctuation shortcuts). Several spellings may map to
# the same kana ('si'/'shi', 'tu'/'tsu', 'la'/'ra', ...).
#
# The single-kana values had been reduced to empty strings, which made the
# transliterator treat those syllables as "no match"; they are restored here.
LATN_TO_HIRA = {
    'a': 'あ', 'i': 'い', 'u': 'う', 'e': 'え', 'o': 'お',
    'ka': 'か', 'ki': 'き', 'ku': 'く', 'ke': 'け', 'ko': 'こ',
    'ga': 'が', 'gi': 'ぎ', 'gu': 'ぐ', 'ge': 'げ', 'go': 'ご',
    'sa': 'さ', 'si': 'し', 'shi': 'し', 'su': 'す', 'se': 'せ', 'so': 'そ',
    'za': 'ざ', 'zi': 'じ', 'ji': 'じ', 'zu': 'ず', 'ze': 'ぜ', 'zo': 'ぞ',
    'ta': 'た', 'ti': 'ち', 'chi': 'ち', 'tu': 'つ', 'tsu': 'つ', 'te': 'て', 'to': 'と',
    'da': 'だ', 'di': 'ぢ', 'du': 'づ', 'dzu': 'づ', 'de': 'で', 'do': 'ど',
    'na': 'な', 'ni': 'に', 'nu': 'ぬ', 'ne': 'ね', 'no': 'の',
    'ha': 'は', 'hi': 'ひ', 'hu': 'ふ', 'fu': 'ふ', 'he': 'へ', 'ho': 'ほ',
    'ba': 'ば', 'bi': 'び', 'bu': 'ぶ', 'be': 'べ', 'bo': 'ぼ',
    'pa': 'ぱ', 'pi': 'ぴ', 'pu': 'ぷ', 'pe': 'ぺ', 'po': 'ぽ',
    'ma': 'ま', 'mi': 'み', 'mu': 'む', 'me': 'め', 'mo': 'も',
    'ya': 'や', 'yu': 'ゆ', 'yo': 'よ',
    'ra': 'ら', 'ri': 'り', 'ru': 'る', 're': 'れ', 'ro': 'ろ',
    'la': 'ら', 'li': 'り', 'lu': 'る', 'le': 'れ', 'lo': 'ろ',
    'wa': 'わ', 'wi': 'うぃ', 'we': 'うぇ', 'wo': 'を',
    'wye': 'ゑ', 'wyi': 'ゐ', '-': 'ー',
    'n': 'ん', 'nn': 'ん', "n'": 'ん',
    'kya': 'きゃ', 'kyu': 'きゅ', 'kyo': 'きょ', 'kye': 'きぇ', 'kyi': 'きぃ',
    'gya': 'ぎゃ', 'gyu': 'ぎゅ', 'gyo': 'ぎょ', 'gye': 'ぎぇ', 'gyi': 'ぎぃ',
    'kwa': 'くぁ', 'kwi': 'くぃ', 'kwu': 'くぅ', 'kwe': 'くぇ', 'kwo': 'くぉ',
    'gwa': 'ぐぁ', 'gwi': 'ぐぃ', 'gwu': 'ぐぅ', 'gwe': 'ぐぇ', 'gwo': 'ぐぉ',
    # 'qwa' is accepted as an alternate spelling of 'gwa'.
    'qwa': 'ぐぁ',
    'sya': 'しゃ', 'syi': 'しぃ', 'syu': 'しゅ', 'sye': 'しぇ', 'syo': 'しょ',
    'sha': 'しゃ', 'shu': 'しゅ', 'she': 'しぇ', 'sho': 'しょ',
    'ja': 'じゃ', 'ju': 'じゅ', 'je': 'じぇ', 'jo': 'じょ',
    'jya': 'じゃ', 'jyi': 'じぃ', 'jyu': 'じゅ', 'jye': 'じぇ', 'jyo': 'じょ',
    'zya': 'じゃ', 'zyu': 'じゅ', 'zyo': 'じょ', 'zye': 'じぇ', 'zyi': 'じぃ',
    'swa': 'すぁ', 'swi': 'すぃ', 'swu': 'すぅ', 'swe': 'すぇ', 'swo': 'すぉ',
    'cha': 'ちゃ', 'chu': 'ちゅ', 'che': 'ちぇ', 'cho': 'ちょ',
    'cya': 'ちゃ', 'cyi': 'ちぃ', 'cyu': 'ちゅ', 'cye': 'ちぇ', 'cyo': 'ちょ',
    'tya': 'ちゃ', 'tyi': 'ちぃ', 'tyu': 'ちゅ', 'tye': 'ちぇ', 'tyo': 'ちょ',
    'dya': 'ぢゃ', 'dyi': 'ぢぃ', 'dyu': 'ぢゅ', 'dye': 'ぢぇ', 'dyo': 'ぢょ',
    'tsa': 'つぁ', 'tsi': 'つぃ', 'tse': 'つぇ', 'tso': 'つぉ',
    'tha': 'てゃ', 'thi': 'てぃ', 'thu': 'てゅ', 'the': 'てぇ', 'tho': 'てょ',
    'twa': 'とぁ', 'twi': 'とぃ', 'twu': 'とぅ', 'twe': 'とぇ', 'two': 'とぉ',
    'dha': 'でゃ', 'dhi': 'でぃ', 'dhu': 'でゅ', 'dhe': 'でぇ', 'dho': 'でょ',
    'dwa': 'どぁ', 'dwi': 'どぃ', 'dwu': 'どぅ', 'dwe': 'どぇ', 'dwo': 'どぉ',
    'nya': 'にゃ', 'nyu': 'にゅ', 'nyo': 'にょ', 'nye': 'にぇ', 'nyi': 'にぃ',
    'hya': 'ひゃ', 'hyi': 'ひぃ', 'hyu': 'ひゅ', 'hye': 'ひぇ', 'hyo': 'ひょ',
    'bya': 'びゃ', 'byi': 'びぃ', 'byu': 'びゅ', 'bye': 'びぇ', 'byo': 'びょ',
    'pya': 'ぴゃ', 'pyi': 'ぴぃ', 'pyu': 'ぴゅ', 'pye': 'ぴぇ', 'pyo': 'ぴょ',
    'fa': 'ふぁ', 'fi': 'ふぃ', 'fe': 'ふぇ', 'fo': 'ふぉ',
    'fwa': 'ふぁ', 'fwi': 'ふぃ', 'fwu': 'ふぅ', 'fwe': 'ふぇ', 'fwo': 'ふぉ',
    'fya': 'ふゃ', 'fyi': 'ふぃ', 'fyu': 'ふゅ', 'fye': 'ふぇ', 'fyo': 'ふょ',
    'mya': 'みゃ', 'myi': 'みぃ', 'myu': 'みゅ', 'mye': 'みぇ', 'myo': 'みょ',
    'rya': 'りゃ', 'ryi': 'りぃ', 'ryu': 'りゅ', 'rye': 'りぇ', 'ryo': 'りょ',
    'lya': 'りゃ', 'lyu': 'りゅ', 'lyo': 'りょ', 'lye': 'りぇ', 'lyi': 'りぃ',
    'va': 'ゔぁ', 'vi': 'ゔぃ', 'vu': 'ゔ', 've': 'ゔぇ', 'vo': 'ゔぉ',
    'vya': 'ゔゃ', 'vyi': 'ゔぃ', 'vyu': 'ゔゅ', 'vye': 'ゔぇ', 'vyo': 'ゔょ',
    'wha': 'うぁ', 'whi': 'うぃ', 'ye': 'いぇ', 'whe': 'うぇ', 'who': 'うぉ',
    # "x" prefix selects the small form of a kana.
    'xa': 'ぁ', 'xi': 'ぃ', 'xu': 'ぅ', 'xe': 'ぇ', 'xo': 'ぉ',
    'xya': 'ゃ', 'xyu': 'ゅ', 'xyo': 'ょ',
    'xtu': 'っ', 'xtsu': 'っ',
    'xka': 'ゕ', 'xke': 'ゖ', 'xwa': 'ゎ',
    # Punctuation shortcuts.
    '@@': ' ', '#[': '「', '#]': '」', '#,': '、', '#.': '。', '#/': '・',
}

57
readme.md Normal file
View file

@ -0,0 +1,57 @@
jTransliterate - [Hirag/Katak]ana to Latin/English & Back
===========================================================================
Sometimes you may want to convert from Hiragana to Katakana, or back again, or...
I dunno, maybe you wanna get the English pronunciation of these words. I'll
be honest and say it's of no concern or interest to me, but I needed this in
Python and so I ported it, figured I'd release it.
It's MIT licensed. Credit for much of this also belongs to Kim Ahlström and
his linguistics/etc work on **[Ve](https://github.com/Kimtaro/ve/blob/master/lib/providers/japanese_transliterators.rb)**.
Installation
---------------------------------------------------------------------------
pip install jTransliterate
Examples && Documentation
---------------------------------------------------------------------------
``` python
# -*- coding: utf-8 -*-
from jTransliterate import JapaneseTransliterator
# Transliterate from Latin/English to [Hirag/Katak]ana
x = JapaneseTransliterator('kanazawa')
print x.transliterate_from_latn_to_hrkt()
# Should print "かなざわ"
# Transliterate from Hiragana to Latin/English
b = JapaneseTransliterator('かなざわ')
print b.transliterate_from_hira_to_latn()
# Should print "kanazawa"
# Transliterate from either Hiragana or Katakana to Latin/English
print b.transliterate_from_hrkt_to_latn(text = 'カナザワ')
# Should print "kanazawa"
# Transliterate from Katakana to Hiragana (You... probably never need to do this)
print b.transliterate_from_kana_to_hira(text = 'キットカート')
# Should print "きっとかーと"
# Transliterate from Hiragana to Katakana
print b.transliterate_from_hira_to_kana(text = 'かなざわ')
# Should print "カナザワ"
# If you want to convert between half/full width kana, you can use the following
# functions. I didn't care enough to do demos here. ;|
b.transliterate_from_halfwidth_to_fullwidth()
b.transliterate_from_fullwidth_to_halfwidth()
```
Questions, Comments, Complaints and/or etc
---------------------------------------------------------------------------
Hit me up on them Twitters or find me on them internets at the links below.
Twitter: **[@ryanmcgrath](http://twitter.com/ryanmcgrath/)**
Web: **[Veno Designs](http://venodesigns.net/)**

34
setup.py Normal file
View file

@ -0,0 +1,34 @@
#!/usr/bin/env python
"""Packaging script for jTransliterate."""
import io
import os

from setuptools import setup
from setuptools import find_packages

__author__ = 'Ryan McGrath <ryan@venodesigns.net>'
__version__ = '1.0.0'

# Read the readme relative to this file (not the current working directory),
# decode it explicitly since it contains non-ASCII text, and close the handle
# once it has been read.
_HERE = os.path.abspath(os.path.dirname(__file__))
with io.open(os.path.join(_HERE, 'readme.md'), encoding='utf-8') as readme_file:
    _LONG_DESCRIPTION = readme_file.read()

setup(
    # Basic package information.
    name='jTransliterate',
    version=__version__,
    packages=find_packages(),
    # Packaging options.
    include_package_data=True,
    # Metadata for PyPI.
    author='Ryan McGrath',
    author_email='ryan@venodesigns.net',
    license='MIT License',
    # Previously pointed at the unrelated "twython" repository.
    url='http://github.com/ryanmcgrath/jTransliterate/tree/master',
    keywords='japanese translation transliterate katakana hiragana latin',
    description='Transliterate [Hirag/Katak]ana to Latin/English and back. Convert half/full-width Japanese text.',
    long_description=_LONG_DESCRIPTION,
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Topic :: Software Development :: Libraries :: Python Modules',
        # NOTE(review): the "Chat" topic looks carried over from another
        # project's setup.py -- confirm it is wanted for this package.
        'Topic :: Communications :: Chat',
        'Topic :: Internet'
    ]
)