Initial, etc
This commit is contained in:
commit
4e04fbe997
6 changed files with 524 additions and 0 deletions
21
LICENSE
Normal file
21
LICENSE
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
The MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2009 - 2010 Ryan McGrath
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
THE SOFTWARE.
|
||||||
254
jTransliterate/__init__.py
Normal file
254
jTransliterate/__init__.py
Normal file
|
|
@ -0,0 +1,254 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
#!/usr/bin/python
|
||||||
|
|
||||||
|
__author__ = "Ryan McGrath <ryan@venodesigns.net>"
|
||||||
|
__version__ = "1.0"
|
||||||
|
|
||||||
|
"""
|
||||||
|
A class that allows for easy transliteration of [Hirag/Katak]ana
|
||||||
|
and English/Latin. Much of the work here is inspired/inherited/etc
|
||||||
|
from Kim Ahlström and his work on "Ve", built in Ruby.
|
||||||
|
|
||||||
|
Credit where credit is due:
|
||||||
|
https://github.com/Kimtaro/ve/blob/master/lib/providers/japanese_transliterators.rb
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Lookup tables for character conversions. Much of this is borrowed from the work of
|
||||||
|
# Kim Ahlström and Ve: https://github.com/Kimtaro/ve/
|
||||||
|
#
|
||||||
|
# Ve's Transliterators are written in Ruby, and I wanted Python. Consider it a nice port. ;)
|
||||||
|
from translation_maps import H_SYLLABIC_N, H_SMALL_TSU, HIRA_TO_LATN, LATN_TO_HIRA
|
||||||
|
|
||||||
|
def defaultToSelfText(fn):
    """
    A fun little decorator that makes it so we can default to
    the text stored on a class instance, but also let people just
    instantiate and re-use calls while supplying new text. Whee.

    Parameters:
        fn - The method to wrap. It is always invoked as
        fn(self, text = ...), so it must accept a "text" argument.

    Returns:
        A wrapper taking (self, text = None). When text is None the
        wrapper passes self.text to fn instead. The wrapper keeps
        fn's name and docstring.
    """
    # Local import: keeps this block independent of the module's import list.
    import functools

    @functools.wraps(fn)
    def wrapper(self, text = None):
        if text is None:
            text = self.text

        return fn(self, text = text)

    return wrapper
|
||||||
|
|
||||||
|
class JapaneseTransliterator(object):
    """
    Converts text between Hiragana, Katakana and Latin script, and
    between half-width and full-width characters.

    The text handed to the constructor is the default input of every
    transliterate_* method; each of them also accepts a replacement
    text for a single call.
    """

    def __init__(self, text):
        """__init__(self, text)

            JapaneseTransliterator("fadjfnjsfnjsafnjsdnf")

            Stores the original text on the instance so the methods can
            default to it, while still allowing it to be overridden on a
            per-function-call basis.

            Parameters:
                text - Text to be operated on. Unicode please!
        """
        self.text = text

    @staticmethod
    def _to_unicode(text):
        """_to_unicode(text)

            Returns text unchanged unless it is a byte string, in which
            case it is decoded as UTF-8.
        """
        if isinstance(text, bytes):
            return text.decode('utf-8')

        return text

    @defaultToSelfText
    def transliterate_from_hrkt_to_latn(self, text):
        """transliterate_from_hrkt_to_latn(self, text)

            Transliterates from [Hirag/Katak]ana to Latin/En. Katakana is
            folded to Hiragana first, then the Hiragana is romanised.

            Parameters:
                text - Optional. Use different text than what's on
                the class instance.
        """
        text = self.transliterate_from_kana_to_hira(text)
        return self.transliterate_from_hira_to_latn(text)

    @defaultToSelfText
    def transliterate_from_hira_to_latn(self, text):
        """transliterate_from_hira_to_latn(self, text)

            Transliterates from Hiragana to Latin/En. Phonetics, that is.
            Characters without an entry in HIRA_TO_LATN are copied through
            unchanged.

            Parameters:
                text - Optional. Use different text than what's on
                the class instance.
        """
        # HIRA_TO_LATN is keyed by unicode strings, so normalise once up
        # front. Input that is already unicode is accepted as-is.
        small_tsu = JapaneseTransliterator._to_unicode(H_SMALL_TSU)
        syllabic_n = JapaneseTransliterator._to_unicode(H_SYLLABIC_N)
        kana = JapaneseTransliterator._to_unicode(text)

        romaji = []
        geminate = False
        index = 0

        while index < len(kana):
            # Two-character digraphs are tried before single kana.
            for length in (2, 1):
                mora = ''
                for_conversion = kana[index:(index + length)]

                if for_conversion == small_tsu:
                    # Small tsu emits nothing itself; it doubles the first
                    # letter of the next mora that gets converted.
                    geminate = True
                    index += length
                    break

                if for_conversion == syllabic_n and re.match(u'[\u3084-\u3088]', kana[(index + 1):(index + 2)]):
                    # Syllabic N directly before U+3084-U+3088 (the ya/yu/yo
                    # kana) gets an apostrophe to keep it apart from nya/nyu/nyo.
                    mora = "n'"
                elif for_conversion in HIRA_TO_LATN:
                    mora = HIRA_TO_LATN[for_conversion]

                if len(mora) > 0:
                    if geminate:
                        geminate = False
                        # Double the first letter of this mora (kitto, not kito).
                        romaji.append(mora[0:1])

                    romaji.append(mora)
                    index += length
                    break
                elif length == 1:
                    # Unknown character: pass it through untouched.
                    romaji.append(for_conversion)
                    index += length

        return ''.join(romaji)

    @defaultToSelfText
    def transliterate_from_latn_to_hrkt(self, text):
        """transliterate_from_latn_to_hrkt(self, text)

            Transliterates from Latin/En to Hiragana (mostly). A chunk that
            contains an upper-case letter is emitted as Katakana instead.
            Characters that cannot be converted are copied through
            lower-cased.

            Parameters:
                text - Optional. Use different text than what's on
                the class instance.
        """
        # "m" in front of b/p is spelled with the syllabic n (shimbun -> shinbun).
        romaji = re.sub(r'm([BbPp])', r'n\1', text)
        romaji = re.sub(r'M([BbPp])', r'N\1', romaji)

        kana = []
        index = 0

        while index < len(romaji):
            # Longest match first: three, then two, then one letter.
            for length in (3, 2, 1):
                mora = ''
                for_removal = length
                for_conversion = romaji[index:(index + length)]
                is_upper = re.search(r'[A-Z]', for_conversion) is not None
                for_conversion = for_conversion.lower()

                if re.match(r'nn[aiueo]', for_conversion):
                    # "nna" is syllabic n + "na": consume only the first n.
                    mora = H_SYLLABIC_N
                    for_removal = 1
                elif for_conversion in LATN_TO_HIRA:
                    mora = LATN_TO_HIRA[for_conversion]
                elif for_conversion == 'tch' or (length == 2 and re.match(r'([kgsztdnbpmyrlwc])\1', for_conversion)):
                    # A doubled consonant becomes a small tsu; the second
                    # consonant stays in place for the next mora.
                    mora = H_SMALL_TSU
                    for_removal = 1

                if mora != '':
                    if is_upper:
                        katakana = self.transliterate_from_hira_to_kana(text = mora)
                        # Under Python 2 the table values are UTF-8 byte
                        # strings while the conversion returns unicode;
                        # re-encode so all pieces can be joined.
                        if isinstance(mora, bytes):
                            katakana = katakana.encode('utf-8')
                        mora = katakana

                    kana.append(mora)
                    index += for_removal
                    break
                elif length == 1:
                    kana.append(for_conversion)
                    index += 1

        return ''.join(kana)

    @defaultToSelfText
    def transliterate_from_kana_to_hira(self, text):
        """transliterate_from_kana_to_hira(self, text)

            Transliterates from Katakana to Hiragana by shifting code
            points 12449-12534 (U+30A1-U+30F6) down by 96.

            Parameters:
                text - Optional. Use different text than what's on
                the class instance.
        """
        return JapaneseTransliterator.transpose_codepoints_in_range(text, -96, 12449, 12534)

    @defaultToSelfText
    def transliterate_from_hira_to_kana(self, text):
        """transliterate_from_hira_to_kana(self, text)

            Transliterates from Hiragana to Katakana by shifting code
            points 12353-12438 (U+3041-U+3096) up by 96.

            Parameters:
                text - Optional. Use different text than what's on
                the class instance.
        """
        return JapaneseTransliterator.transpose_codepoints_in_range(text, 96, 12353, 12438)

    @defaultToSelfText
    def transliterate_from_fullwidth_to_halfwidth(self, text):
        """transliterate_from_fullwidth_to_halfwidth(self, text)

            Transliterates from full-width to half-width: code points
            65281-65374 (U+FF01-U+FF5E) map onto 33-126, and 12288
            (U+3000) maps onto the plain space, 32.

            Parameters:
                text - Optional. Use different text than what's on
                the class instance.
        """
        text = JapaneseTransliterator.transpose_codepoints_in_range(text, -65248, 65281, 65374)
        return JapaneseTransliterator.transpose_codepoints_in_range(text, -12256, 12288, 12288)

    @defaultToSelfText
    def transliterate_from_halfwidth_to_fullwidth(self, text):
        """transliterate_from_halfwidth_to_fullwidth(self, text)

            Transliterates from half-width to full-width: code points
            33-126 map onto 65281-65374 (U+FF01-U+FF5E), and the plain
            space, 32, maps onto 12288 (U+3000).

            Parameters:
                text - Optional. Use different text than what's on
                the class instance.
        """
        text = JapaneseTransliterator.transpose_codepoints_in_range(text, 65248, 33, 126)
        return JapaneseTransliterator.transpose_codepoints_in_range(text, 12256, 32, 32)

    @staticmethod
    def transpose_codepoints_in_range(text, distance, range_start, range_end):
        """JapaneseTransliterator.transpose_codepoints_in_range(text, distance, range_start, range_end)

            Given a set of text (unicode...), coupled with distance and range, transposes
            it for a corresponding swap and returns the new set. Characters outside
            the range are left as they are.

            Parameters:
                text - text to be transposed, codepoint-wise. Byte strings are
                decoded as UTF-8 first.
                distance - to the other side of the map
                range_start - start of the range we're interested in, codepoint-wise
                range_end - end of the range we're interested in, codepoint-wise (inclusive)

            Returns:
                the transposed text, as a unicode string
        """
        # A byte string that is not valid UTF-8 raises UnicodeDecodeError here.
        text = JapaneseTransliterator._to_unicode(text)

        try:
            to_char = unichr  # Python 2
        except NameError:
            to_char = chr  # Python 3: chr already covers all of Unicode

        transposed = []

        for char in text:
            codepoint = ord(char)

            if range_start <= codepoint <= range_end:
                transposed.append(to_char(codepoint + distance))
            else:
                transposed.append(char)

        return u''.join(transposed)
|
||||||
30
jTransliterate/test.py
Normal file
30
jTransliterate/test.py
Normal file
|
|
@ -0,0 +1,30 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __init__ import JapaneseTransliterator

# Latin/English -> [Hirag/Katak]ana
romaji_source = JapaneseTransliterator('kanazawa')
print(romaji_source.transliterate_from_latn_to_hrkt())
# Should print "かなざわ"

# Hiragana -> Latin/English
kana_source = JapaneseTransliterator('かなざわ')
print(kana_source.transliterate_from_hira_to_latn())
# Should print "kanazawa"

# Either Hiragana or Katakana -> Latin/English, overriding the stored text
print(kana_source.transliterate_from_hrkt_to_latn(text = 'カナザワ'))
# Should print "kanazawa"

# Katakana -> Hiragana (You... probably never need to do this)
print(kana_source.transliterate_from_kana_to_hira(text = 'キットカート'))
# Should print "きっとかーと"

# Hiragana -> Katakana
print(kana_source.transliterate_from_hira_to_kana(text = 'かなざわ'))
# Should print "カナザワ"

# The half/full width conversions are only exercised here, on the stored
# text; their results are not printed.
kana_source.transliterate_from_halfwidth_to_fullwidth()
kana_source.transliterate_from_fullwidth_to_halfwidth()
|
||||||
128
jTransliterate/translation_maps.py
Normal file
128
jTransliterate/translation_maps.py
Normal file
|
|
@ -0,0 +1,128 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Hiragana syllabic n.
H_SYLLABIC_N = 'ん'
# Hiragana small tsu.
H_SMALL_TSU = 'っ'
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
Python sucks with regards to unicode-fun, but I'm leaving this here as
|
||||||
|
a fun reference for anyone deciphering all this. Enjoy. -- Ryan
|
||||||
|
|
||||||
|
HIRA_TO_LATN = {
|
||||||
|
"あ":"a", "い":"i", "う":"", "え":"e", "お":"o",
|
||||||
|
"か":"ka", "き":"ki", "く":"k", "け":"ke", "こ":"ko",
|
||||||
|
"が":"ga", "ぎ":"gi", "ぐ":"g", "げ":"ge", "ご":"go",
|
||||||
|
"さ":"sa", "し":"shi", "す":"s", "せ":"se", "そ":"so",
|
||||||
|
"ざ":"za", "じ":"ji", "ず":"z", "ぜ":"ze", "ぞ":"zo",
|
||||||
|
"た":"ta", "ち":"chi", "つ":"ts", "て":"te", "と":"to",
|
||||||
|
"だ":"da", "ぢ":"ji", "づ":"z", "で":"de", "ど":"do",
|
||||||
|
"な":"na", "に":"ni", "ぬ":"n", "ね":"ne", "の":"no",
|
||||||
|
"は":"ha", "ひ":"hi", "ふ":"f", "へ":"he", "ほ":"ho",
|
||||||
|
"ば":"ba", "び":"bi", "ぶ":"b", "べ":"be", "ぼ":"bo",
|
||||||
|
"ぱ":"pa", "ぴ":"pi", "ぷ":"p", "ぺ":"pe", "ぽ":"po",
|
||||||
|
"ま":"ma", "み":"mi", "む":"m", "め":"me", "も":"mo",
|
||||||
|
"や":"ya", "ゆ":"y", "よ":"yo",
|
||||||
|
"ら":"ra", "り":"ri", "る":"r", "れ":"re", "ろ":"ro",
|
||||||
|
"わ":"wa", "うぃ":"whi", "うぇ":"whe", "を":"wo",
|
||||||
|
"ゑ":"wye", "ゐ":"wyi", "ー":"-", "ん":"n",
|
||||||
|
|
||||||
|
"きゃ":"kya", "きゅ":"ky", "きょ":"kyo", "きぇ":"kye", "きぃ":"kyi",
|
||||||
|
"ぎゃ":"gya", "ぎゅ":"gy", "ぎょ":"gyo", "ぎぇ":"gye", "ぎぃ":"gyi",
|
||||||
|
"くぁ":"kwa", "くぃ":"kwi", "くぅ":"kw", "くぇ":"kwe", "くぉ":"kwo",
|
||||||
|
"ぐぁ":"qwa", "ぐぃ":"gwi", "ぐぅ":"gw", "ぐぇ":"gwe", "ぐぉ":"gwo",
|
||||||
|
"しゃ":"sha", "しぃ":"syi", "しゅ":"sh", "しぇ":"she", "しょ":"sho",
|
||||||
|
"じゃ":"jya", "じゅ":"zy", "じぇ":"zye", "じょ":"zyo", "じぃ":"zyi",
|
||||||
|
"すぁ":"swa", "すぃ":"swi", "すぅ":"sw", "すぇ":"swe", "すぉ":"swo",
|
||||||
|
"ちゃ":"tya", "ちゅ":"ty", "ちぇ":"tye", "ちょ":"tyo", "ちぃ":"tyi",
|
||||||
|
"ぢゃ":"dya", "ぢぃ":"dyi", "ぢゅ":"dy", "ぢぇ":"dye", "ぢょ":"dyo",
|
||||||
|
"つぁ":"tsa", "つぃ":"tsi", "つぇ":"tse", "つぉ":"tso", "てゃ":"tha",
|
||||||
|
"てぃ":"thi", "てゅ":"th", "てぇ":"the", "てょ":"tho", "とぁ":"twa",
|
||||||
|
"とぃ":"twi", "とぅ":"tw", "とぇ":"twe", "とぉ":"two", "でゃ":"dha",
|
||||||
|
"でぃ":"dhi", "でゅ":"dh", "でぇ":"dhe", "でょ":"dho", "どぁ":"dwa",
|
||||||
|
"どぃ":"dwi", "どぅ":"dw", "どぇ":"dwe", "どぉ":"dwo", "にゃ":"nya",
|
||||||
|
"にゅ":"ny", "にょ":"nyo", "にぇ":"nye", "にぃ":"nyi", "ひゃ":"hya",
|
||||||
|
"ひぃ":"hyi", "ひゅ":"hy", "ひぇ":"hye", "ひょ":"hyo", "びゃ":"bya",
|
||||||
|
"びぃ":"byi", "びゅ":"by", "びぇ":"bye", "びょ":"byo", "ぴゃ":"pya",
|
||||||
|
"ぴぃ":"pyi", "ぴゅ":"py", "ぴぇ":"pye", "ぴょ":"pyo", "ふぁ":"fwa",
|
||||||
|
"ふぃ":"fyi", "ふぇ":"fye", "ふぉ":"fwo", "ふぅ":"fw", "ふゃ":"fya",
|
||||||
|
"ふゅ":"fy", "ふょ":"fyo", "みゃ":"mya", "みぃ":"myi", "みゅ":"my",
|
||||||
|
"みぇ":"mye", "みょ":"myo", "りゃ":"rya", "りぃ":"ryi", "りゅ":"ry",
|
||||||
|
"りぇ":"rye", "りょ":"ryo",
|
||||||
|
"ゔぁ":"va", "ゔぃ":"vyi", "ゔ":"v", "ゔぇ":"vye", "ゔぉ":"vo",
|
||||||
|
"ゔゃ":"vya", "ゔゅ":"vy", "ゔょ":"vyo",
|
||||||
|
"うぁ":"wha", "いぇ":"ye", "うぉ":"who",
|
||||||
|
"ぁ":"xa", "ぃ":"xi", "ぅ":"x", "ぇ":"xe", "ぉ":"xo",
|
||||||
|
"ゕ":"xka", "ゖ":"xke", "ゎ":"xwa"
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
# Hiragana -> Latin lookup, keyed by unicode strings of one or two kana.
# Entry order is unchanged from the original literal; the only value that
# differs is u'\u3050\u3041' (gu + small a), which was 'qwa' and is now 'gwa'.
HIRA_TO_LATN = {
    u'\u3057\u3047': 'she', u'\u3057\u3043': 'syi', u'\u308b': 'ru', u'\u3093': 'n',
    u'\u3074\u3047': 'pye', u'\u3074\u3043': 'pyi', u'\u304d\u3085': 'kyu', u'\u304d\u3087': 'kyo',
    u'\u304d\u3083': 'kya', u'\u3067\u3047': 'dhe', u'\u3050\u3041': 'gwa', u'\u3067\u3043': 'dhi',
    u'\u3094\u3087': 'vyo', u'\u308a\u3043': 'ryi', u'\u3094\u3083': 'vya', u'\u3048': 'e',
    u'\u3050': 'gu', u'\u3058': 'ji', u'\u3060': 'da', u'\u3064\u3049': 'tso',
    u'\u3064\u3047': 'tse', u'\u3064\u3043': 'tsi', u'\u3064\u3041': 'tsa', u'\u3070': 'ba',
    u'\u3078': 'he', u'\u3080': 'mu', u'\u3088': 'yo', u'\u3043': 'xi',
    u'\u3090': 'wyi', u'\u3050\u3043': 'gwi', u'\u3072': 'hi', u'\u3050\u3047': 'gwe',
    u'\u3050\u3045': 'gwu', u'\u3050\u3049': 'gwo', u'\u3057\u3087': 'sho', u'\u3057\u3085': 'shu',
    u'\u3057\u3083': 'sha', u'\u304b': 'ka', u'\u3053': 'ko', u'\u3074\u3087': 'pyo',
    u'\u305b': 'se', u'\u3074\u3085': 'pyu', u'\u3074\u3083': 'pya', u'\u304d\u3047': 'kye',
    u'\u3068\u3041': 'twa', u'\u304d\u3043': 'kyi', u'\u306b': 'ni', u'\u3067\u3087': 'dho',
    u'\u3067\u3085': 'dhu', u'\u3067\u3083': 'dha', u'\u3094\u3049': 'vo', u'\u3094\u3047': 'vye',
    u'\u307b': 'ho', u'\u3094\u3043': 'vyi', u'\u3094\u3041': 'va', u'\u3081': 'me',
    u'\u3089': 'ra', u'\u3091': 'wye', u'\u3046\u3041': 'wha', u'\u3046\u3043': 'whi',
    u'\u3046\u3047': 'whe', u'\u3073\u3083': 'bya', u'\u3046\u3049': 'who', u'\u3073\u3087': 'byo',
    u'\u3073\u3085': 'byu', u'\u3066\u3083': 'tha', u'\u3066\u3085': 'thu', u'\u3066\u3087': 'tho',
    u'\u3046': 'u', u'\u304e': 'gi', u'\u3056': 'za', u'\u308a\u3047': 'rye',
    u'\u305e': 'zo', u'\u3094\u3085': 'vyu', u'\u3066': 'te', u'\u306e': 'no',
    u'\u3076': 'bu', u'\u307e': 'ma', u'\u3059\u3049': 'swo', u'\u3086': 'yu',
    u'\u3059\u3041': 'swa', u'\u3059\u3043': 'swi', u'\u3059\u3045': 'swu', u'\u3059\u3047': 'swe',
    u'\u308e': 'xwa', u'\u3096': 'xke', u'\u308a\u3085': 'ryu', u'\u308a\u3087': 'ryo',
    u'\u308a\u3083': 'rya', u'\u3073': 'bi', u'\u3069\u3049': 'dwo', u'\u3069\u3041': 'dwa',
    u'\u3069\u3043': 'dwi', u'\u3069\u3045': 'dwu', u'\u3069\u3047': 'dwe', u'\u3041': 'xa',
    u'\u3049': 'xo', u'\u3051': 'ke', u'\u3073\u3043': 'byi', u'\u3073\u3047': 'bye',
    u'\u3061': 'chi', u'\u3069': 'do', u'\u3071': 'pa', u'\u3066\u3043': 'thi',
    u'\u3066\u3047': 'the', u'\u3079': 'be', u'\u308f': 'wa', u'\u3062\u3085': 'dyu',
    u'\u3062\u3087': 'dyo', u'\u3062\u3083': 'dya', u'\u307f\u3087': 'myo', u'\u307f\u3085': 'myu',
    u'\u307f\u3083': 'mya', u'\u3044': 'i', u'\u304c': 'ga', u'\u3072\u3085': 'hyu',
    u'\u3072\u3087': 'hyo', u'\u3054': 'go', u'\u3072\u3083': 'hya', u'\u305c': 'ze',
    u'\u3064': 'tsu', u'\u304f\u3049': 'kwo', u'\u304f\u3047': 'kwe', u'\u304f\u3045': 'kwu',
    u'\u304f\u3043': 'kwi', u'\u306c': 'nu', u'\u304f\u3041': 'kwa', u'\u3074': 'pi',
    u'\u3068': 'to', u'\u307c': 'bo', u'\u3084': 'ya', u'\u308c': 're',
    u'\u3072\u3047': 'hye', u'\u3094': 'vu', u'\u3072\u3043': 'hyi', u'\u3045': 'xu',
    u'\u3047': 'xe', u'\u304f': 'ku', u'\u3057': 'shi', u'\u305f': 'ta',
    u'\u3062\u3047': 'dye', u'\u3067': 'de', u'\u3062\u3043': 'dyi', u'\u306f': 'ha',
    u'\u3077': 'pu', u'\u307f\u3047': 'mye', u'\u307f\u3043': 'myi', u'\u30fc': '-',
    u'\u307f': 'mi', u'\u306b\u3083': 'nya', u'\u306b\u3087': 'nyo', u'\u306b\u3085': 'nyu',
    u'\u308d': 'ro', u'\u3059': 'su', u'\u3095': 'xka', u'\u304e\u3043': 'gyi',
    u'\u304e\u3047': 'gye', u'\u3042': 'a', u'\u3058\u3043': 'zyi', u'\u304a': 'o',
    u'\u3058\u3047': 'zye', u'\u3052': 'ge', u'\u3075\u3049': 'fwo', u'\u3075\u3045': 'fwu',
    u'\u3075\u3047': 'fye', u'\u305a': 'zu', u'\u3075\u3041': 'fwa', u'\u3075\u3043': 'fyi',
    u'\u3061\u3083': 'tya', u'\u3062': 'ji', u'\u3061\u3085': 'tyu', u'\u3061\u3087': 'tyo',
    u'\u306a': 'na', u'\u3044\u3047': 'ye', u'\u3068\u3049': 'two', u'\u3068\u3043': 'twi',
    u'\u307a': 'pe', u'\u3068\u3047': 'twe', u'\u3068\u3045': 'twu', u'\u3082': 'mo',
    u'\u3058\u3083': 'jya', u'\u308a': 'ri', u'\u3058\u3087': 'zyo', u'\u3058\u3085': 'zyu',
    u'\u3092': 'wo', u'\u3075\u3085': 'fyu', u'\u3075\u3087': 'fyo', u'\u3075\u3083': 'fya',
    u'\u3061\u3043': 'tyi', u'\u3061\u3047': 'tye', u'\u306b\u3043': 'nyi', u'\u306b\u3047': 'nye',
    u'\u304d': 'ki', u'\u3055': 'sa', u'\u305d': 'so', u'\u3065': 'zu',
    u'\u304e\u3083': 'gya', u'\u306d': 'ne', u'\u304e\u3085': 'gyu', u'\u304e\u3087': 'gyo',
    u'\u3075': 'fu', u'\u307d': 'po',
}
|
||||||
|
|
||||||
|
# Latin -> Hiragana lookup, keyed by lower-case chunks of one to three
# letters (plus a few punctuation shortcuts at the end).
LATN_TO_HIRA = {
    'a': 'あ', 'i': 'い', 'u': 'う', 'e': 'え', 'o': 'お',
    'ka': 'か', 'ki': 'き', 'ku': 'く', 'ke': 'け', 'ko': 'こ',
    'ga': 'が', 'gi': 'ぎ', 'gu': 'ぐ', 'ge': 'げ', 'go': 'ご',
    'sa': 'さ', 'si': 'し', 'shi': 'し', 'su': 'す', 'se': 'せ', 'so': 'そ',
    'za': 'ざ', 'zi': 'じ', 'ji': 'じ', 'zu': 'ず', 'ze': 'ぜ', 'zo': 'ぞ',
    'ta': 'た', 'ti': 'ち', 'chi': 'ち', 'tu': 'つ', 'tsu': 'つ', 'te': 'て', 'to': 'と',
    'da': 'だ', 'di': 'ぢ', 'du': 'づ', 'dzu': 'づ', 'de': 'で', 'do': 'ど',
    'na': 'な', 'ni': 'に', 'nu': 'ぬ', 'ne': 'ね', 'no': 'の',
    'ha': 'は', 'hi': 'ひ', 'hu': 'ふ', 'fu': 'ふ', 'he': 'へ', 'ho': 'ほ',
    'ba': 'ば', 'bi': 'び', 'bu': 'ぶ', 'be': 'べ', 'bo': 'ぼ',
    'pa': 'ぱ', 'pi': 'ぴ', 'pu': 'ぷ', 'pe': 'ぺ', 'po': 'ぽ',
    'ma': 'ま', 'mi': 'み', 'mu': 'む', 'me': 'め', 'mo': 'も',
    'ya': 'や', 'yu': 'ゆ', 'yo': 'よ',
    'ra': 'ら', 'ri': 'り', 'ru': 'る', 're': 'れ', 'ro': 'ろ',
    'la': 'ら', 'li': 'り', 'lu': 'る', 'le': 'れ', 'lo': 'ろ',
    'wa': 'わ', 'wi': 'うぃ', 'we': 'うぇ', 'wo': 'を',
    'wye': 'ゑ', 'wyi': 'ゐ', '-': 'ー',

    'n': 'ん', 'nn': 'ん', "n'": 'ん',

    'kya': 'きゃ', 'kyu': 'きゅ', 'kyo': 'きょ', 'kye': 'きぇ', 'kyi': 'きぃ',
    'gya': 'ぎゃ', 'gyu': 'ぎゅ', 'gyo': 'ぎょ', 'gye': 'ぎぇ', 'gyi': 'ぎぃ',
    'kwa': 'くぁ', 'kwi': 'くぃ', 'kwu': 'くぅ', 'kwe': 'くぇ', 'kwo': 'くぉ',
    'gwa': 'ぐぁ', 'gwi': 'ぐぃ', 'gwu': 'ぐぅ', 'gwe': 'ぐぇ', 'gwo': 'ぐぉ',
    # This row used to repeat the gwi/gwu/gwe/gwo keys from the row above;
    # only its one distinct key is kept.
    'qwa': 'ぐぁ',

    'sya': 'しゃ', 'syi': 'しぃ', 'syu': 'しゅ', 'sye': 'しぇ', 'syo': 'しょ',
    'sha': 'しゃ', 'shu': 'しゅ', 'she': 'しぇ', 'sho': 'しょ',
    'ja': 'じゃ', 'ju': 'じゅ', 'je': 'じぇ', 'jo': 'じょ',
    'jya': 'じゃ', 'jyi': 'じぃ', 'jyu': 'じゅ', 'jye': 'じぇ', 'jyo': 'じょ',
    'zya': 'じゃ', 'zyu': 'じゅ', 'zyo': 'じょ', 'zye': 'じぇ', 'zyi': 'じぃ',
    'swa': 'すぁ', 'swi': 'すぃ', 'swu': 'すぅ', 'swe': 'すぇ', 'swo': 'すぉ',

    'cha': 'ちゃ', 'chu': 'ちゅ', 'che': 'ちぇ', 'cho': 'ちょ',
    'cya': 'ちゃ', 'cyi': 'ちぃ', 'cyu': 'ちゅ', 'cye': 'ちぇ', 'cyo': 'ちょ',
    'tya': 'ちゃ', 'tyi': 'ちぃ', 'tyu': 'ちゅ', 'tye': 'ちぇ', 'tyo': 'ちょ',
    'dya': 'ぢゃ', 'dyi': 'ぢぃ', 'dyu': 'ぢゅ', 'dye': 'ぢぇ', 'dyo': 'ぢょ',
    'tsa': 'つぁ', 'tsi': 'つぃ', 'tse': 'つぇ', 'tso': 'つぉ',
    'tha': 'てゃ', 'thi': 'てぃ', 'thu': 'てゅ', 'the': 'てぇ', 'tho': 'てょ',
    'twa': 'とぁ', 'twi': 'とぃ', 'twu': 'とぅ', 'twe': 'とぇ', 'two': 'とぉ',
    'dha': 'でゃ', 'dhi': 'でぃ', 'dhu': 'でゅ', 'dhe': 'でぇ', 'dho': 'でょ',
    'dwa': 'どぁ', 'dwi': 'どぃ', 'dwu': 'どぅ', 'dwe': 'どぇ', 'dwo': 'どぉ',

    'nya': 'にゃ', 'nyu': 'にゅ', 'nyo': 'にょ', 'nye': 'にぇ', 'nyi': 'にぃ',

    'hya': 'ひゃ', 'hyi': 'ひぃ', 'hyu': 'ひゅ', 'hye': 'ひぇ', 'hyo': 'ひょ',
    'bya': 'びゃ', 'byi': 'びぃ', 'byu': 'びゅ', 'bye': 'びぇ', 'byo': 'びょ',
    'pya': 'ぴゃ', 'pyi': 'ぴぃ', 'pyu': 'ぴゅ', 'pye': 'ぴぇ', 'pyo': 'ぴょ',
    'fa': 'ふぁ', 'fi': 'ふぃ', 'fe': 'ふぇ', 'fo': 'ふぉ',
    'fwa': 'ふぁ', 'fwi': 'ふぃ', 'fwu': 'ふぅ', 'fwe': 'ふぇ', 'fwo': 'ふぉ',
    'fya': 'ふゃ', 'fyi': 'ふぃ', 'fyu': 'ふゅ', 'fye': 'ふぇ', 'fyo': 'ふょ',

    'mya': 'みゃ', 'myi': 'みぃ', 'myu': 'みゅ', 'mye': 'みぇ', 'myo': 'みょ',

    'rya': 'りゃ', 'ryi': 'りぃ', 'ryu': 'りゅ', 'rye': 'りぇ', 'ryo': 'りょ',
    'lya': 'りゃ', 'lyu': 'りゅ', 'lyo': 'りょ', 'lye': 'りぇ', 'lyi': 'りぃ',

    'va': 'ゔぁ', 'vi': 'ゔぃ', 'vu': 'ゔ', 've': 'ゔぇ', 'vo': 'ゔぉ',
    'vya': 'ゔゃ', 'vyi': 'ゔぃ', 'vyu': 'ゔゅ', 'vye': 'ゔぇ', 'vyo': 'ゔょ',
    'wha': 'うぁ', 'whi': 'うぃ', 'ye': 'いぇ', 'whe': 'うぇ', 'who': 'うぉ',

    'xa': 'ぁ', 'xi': 'ぃ', 'xu': 'ぅ', 'xe': 'ぇ', 'xo': 'ぉ',
    'xya': 'ゃ', 'xyu': 'ゅ', 'xyo': 'ょ',
    'xtu': 'っ', 'xtsu': 'っ',
    'xka': 'ゕ', 'xke': 'ゖ', 'xwa': 'ゎ',

    '@@': '　', '#[': '「', '#]': '」', '#,': '、', '#.': '。', '#/': '・',
}
|
||||||
57
readme.md
Normal file
57
readme.md
Normal file
|
|
@ -0,0 +1,57 @@
|
||||||
|
jTransliterate - [Hirag/Katak]ana to Latin/English & Back
|
||||||
|
===========================================================================
|
||||||
|
Sometimes you may want to convert from Hiragana to Katakana, or back again, or...
|
||||||
|
I dunno, maybe you wanna get the English pronunciation of these words. I'll
|
||||||
|
be honest and say it's of no concern or interest to me, but I needed this in
|
||||||
|
Python and so I ported it, figured I'd release it.
|
||||||
|
|
||||||
|
It's MIT licensed. Credit for much of this also belongs to Kim Ahlström and
|
||||||
|
his linguistics/etc work on **[Ve](https://github.com/Kimtaro/ve/blob/master/lib/providers/japanese_transliterators.rb)**.
|
||||||
|
|
||||||
|
|
||||||
|
Installation
|
||||||
|
---------------------------------------------------------------------------
|
||||||
|
pip install jTransliterate
|
||||||
|
|
||||||
|
|
||||||
|
Examples && Documentation
|
||||||
|
---------------------------------------------------------------------------
|
||||||
|
``` python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from jTransliterate import JapaneseTransliterator
|
||||||
|
|
||||||
|
# Transliterate from Latin/English to [Hirag/Katak]ana
|
||||||
|
x = JapaneseTransliterator('kanazawa')
|
||||||
|
print x.transliterate_from_latn_to_hrkt()
|
||||||
|
# Should print "かなざわ"
|
||||||
|
|
||||||
|
# Transliterate from Hiragana to Latin/English
|
||||||
|
b = JapaneseTransliterator('かなざわ')
|
||||||
|
print b.transliterate_from_hira_to_latn()
|
||||||
|
# Should print "kanazawa"
|
||||||
|
|
||||||
|
# Transliterate from either Hiragana or Katakana to Latin/English
|
||||||
|
print b.transliterate_from_hrkt_to_latn(text = 'カナザワ')
|
||||||
|
# Should print "kanazawa"
|
||||||
|
|
||||||
|
# Transliterate from Katakana to Hiragana (you'll probably never need to do this)
|
||||||
|
print b.transliterate_from_kana_to_hira(text = 'キットカート')
|
||||||
|
# Should print "きっとかーと"
|
||||||
|
|
||||||
|
# Transliterate from Hiragana to Katakana
|
||||||
|
print b.transliterate_from_hira_to_kana(text = 'かなざわ')
|
||||||
|
# Should print "カナザワ"
|
||||||
|
|
||||||
|
# If you want to convert between half/full width kana, you can use the following
|
||||||
|
# functions. I didn't care enough to do demos here. ;|
|
||||||
|
b.transliterate_from_halfwidth_to_fullwidth()
|
||||||
|
b.transliterate_from_fullwidth_to_halfwidth()
|
||||||
|
```
|
||||||
|
|
||||||
|
Questions, Comments, Complaints and/or etc
|
||||||
|
---------------------------------------------------------------------------
|
||||||
|
Hit me up on them Twitters or find me on them internets at the links below.
|
||||||
|
|
||||||
|
Twitter: **[@ryanmcgrath](http://twitter.com/ryanmcgrath/)**
|
||||||
|
Web: **[Veno Designs](http://venodesigns.net/)**
|
||||||
34
setup.py
Normal file
34
setup.py
Normal file
|
|
@ -0,0 +1,34 @@
|
||||||
|
#!/usr/bin/env python

import io
import os

from setuptools import setup
from setuptools import find_packages

__author__ = 'Ryan McGrath <ryan@venodesigns.net>'
__version__ = '1.0.0'


def _read_readme():
    """Return the text of readme.md, located next to this setup script.

    The path is resolved from this file rather than the working directory,
    the file is read as UTF-8 (it contains non-ASCII text), and the handle
    is closed as soon as the content has been read.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    with io.open(os.path.join(here, 'readme.md'), encoding='utf-8') as readme:
        return readme.read()


setup(
    # Basic package information.
    name='jTransliterate',
    version=__version__,
    packages=find_packages(),

    # Packaging options.
    include_package_data=True,

    # Metadata for PyPI.
    author='Ryan McGrath',
    author_email='ryan@venodesigns.net',
    license='MIT License',
    # NOTE(review): this previously pointed at the twython repository;
    # the jTransliterate location below is assumed -- confirm it.
    url='http://github.com/ryanmcgrath/jTransliterate/tree/master',
    keywords='japanese translation transliterate katakana hiragana latin',
    description='Transliterate [Hirag/Katak]ana to Latin/English and back. Convert half/full-width Japanese text.',
    long_description=_read_readme(),
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'Topic :: Communications :: Chat',
        'Topic :: Internet'
    ]
)
|
||||||
Loading…
Add table
Add a link
Reference in a new issue