diff --git a/AGENTS.md b/AGENTS.md index 51f8c0da..60fb7b5a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -46,6 +46,7 @@ This guide explains how automation agents and human contributors should work wit - **Full suite** – `make test-all` runs unit then integration tests sequentially. Set `RUST_LOG=debug` to inspect SQL queries during debugging. - **Linting and formatting** – Run `cargo fmt` and `cargo clippy --all-targets --all-features` before committing. Resolve new Clippy warnings unless an existing `#![allow]` covers the case. - **Data verification** – Execute `cargo run -p data_validator` whenever CSVs change and record results in pull requests. +- **IPA coverage audit** – Execute `make ipa-audit` when English or romanized CSV names change. This is a read-only report for `data/2!lines.csv`, `data/3!stations.csv`, and `data/4!types.csv`; it does not fail validation, but highlights unresolved tokens and example names so the IPA dictionary can be extended deliberately. ## gRPC Endpoint Overview - **Stations** – `GetStationById`, `GetStationByIdList`, `GetStationsByGroupId`, `GetStationsByCoordinates`, `GetStationsByLineId`, `GetStationsByName`, `GetStationsByLineGroupId`. `QueryInteractor` enriches stations with lines, companies, station numbers, and train types. 
diff --git a/Makefile b/Makefile index 271d5956..38ff2d92 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # StationAPI Makefile # Cargoを使ったテスト実行のためのシンプルなタスク定義 -.PHONY: test test-unit test-integration test-all clean help +.PHONY: test test-unit test-integration test-all ipa-audit clean help # デフォルトターゲット help: @@ -9,6 +9,7 @@ help: @echo " test-unit - Run unit tests only (no database required)" @echo " test-integration - Run integration tests (requires PostgreSQL)" @echo " test-all - Run all tests" + @echo " ipa-audit - Print IPA coverage report for English/romanized CSV names" @echo " test - Alias for test-unit" @echo " clean - Clean build artifacts" @echo "" @@ -30,6 +31,11 @@ test-integration: # 全てのテストを実行 test-all: test-unit test-integration +ipa-audit: + @echo "Printing IPA coverage report..." + rustc tools/ipa_audit.rs -o /tmp/stationapi-ipa-audit + /tmp/stationapi-ipa-audit + # デフォルトはユニットテスト test: test-unit diff --git a/stationapi/proto b/stationapi/proto index f05b09f3..110eee9b 160000 --- a/stationapi/proto +++ b/stationapi/proto @@ -1 +1 @@ -Subproject commit f05b09f37213515a3a7d79d16d714b9c61984e5a +Subproject commit 110eee9b95578b15fdb7701f553a2892b1310d01 diff --git a/stationapi/src/domain/ipa.rs b/stationapi/src/domain/ipa.rs index 32abd4b9..9b206835 100644 --- a/stationapi/src/domain/ipa.rs +++ b/stationapi/src/domain/ipa.rs @@ -60,6 +60,626 @@ pub fn katakana_to_ipa(input: &str) -> Option { Some(apply_phonological_rules(&result)) } +/// Convert a station name to IPA. +/// Prefers the official romanized/English name when present so mixed names like +/// "Kasai-Rinkai Park" use English pronunciation for translated segments. 
+pub fn station_name_to_ipa(name_katakana: &str, name_roman: Option<&str>) -> Option<String> { + name_roman + .map(str::trim) + .filter(|name| !name.is_empty()) + .and_then(romanized_name_to_ipa) + .filter(|ipa| !ipa.is_empty()) + .or_else(|| katakana_to_ipa(name_katakana)) + .filter(|ipa| !ipa.is_empty()) +} + +fn romanized_name_to_ipa(input: &str) -> Option<String> { + let mut output = String::new(); + let mut token = String::new(); + let mut emitted_word = false; + + for c in input.chars() { + if is_name_token_char(c) { + token.push(c); + continue; + } + + flush_name_token(&mut output, &mut token, &mut emitted_word)?; + + if is_separator_like(c) && emitted_word && !output.ends_with(' ') { + output.push(' '); + } + } + + flush_name_token(&mut output, &mut token, &mut emitted_word)?; + + Some(output.trim().to_string()) +} + +fn flush_name_token( + output: &mut String, + token: &mut String, + emitted_word: &mut bool, +) -> Option<()> { + if token.is_empty() { + return Some(()); + } + + let ipa = word_to_ipa(token)?; + if *emitted_word && !output.ends_with(' ') { + output.push(' '); + } + output.push_str(&ipa); + *emitted_word = true; + token.clear(); + Some(()) +} + +fn word_to_ipa(token: &str) -> Option<String> { + let normalized = normalize_name_token(token); + if normalized.is_empty() { + return Some(String::new()); + } + + if let Some(ipa) = lookup_english_word_ipa(&normalized) { + return Some(ipa.to_string()); + } + + if normalized.chars().all(|c| c.is_ascii_digit()) { + if let Some(ipa) = number_to_ipa(&normalized) { + return Some(ipa.to_string()); + } + + let mut output = String::new(); + for digit in normalized.chars() { + let ipa = number_to_ipa(&digit.to_string())?; + output.push_str(ipa); + } + return Some(output); + } + + romaji_to_katakana(&normalized).and_then(|katakana| katakana_to_ipa(&katakana)) +} + +fn is_name_token_char(c: char) -> bool { + c.is_ascii_alphanumeric() + || matches!( + c, + '\'' | '.' 
| 'Ā' | 'Ī' | 'Ū' | 'Ē' | 'Ō' | 'ā' | 'ī' | 'ū' | 'ē' | 'ō' + ) +} + +fn is_separator_like(c: char) -> bool { + c.is_whitespace() + || matches!( + c, + '-' | '‐' + | '‑' + | '‒' + | '–' + | '—' + | '―' + | '/' + | '・' + | '・' + | '·' + | '(' + | ')' + | '(' + | ')' + | ',' + | '、' + ) +} + +fn normalize_name_token(token: &str) -> String { + token + .trim_matches(|c: char| !is_name_token_char(c)) + .trim_end_matches('.') + .chars() + .flat_map(normalize_name_char) + .collect::<String>() + .to_lowercase() +} + +fn normalize_name_char(c: char) -> Vec<char> { + match c { + 'Ā' | 'ā' => vec!['a', 'a'], + 'Ī' | 'ī' => vec!['i', 'i'], + 'Ū' | 'ū' => vec!['u', 'u'], + 'Ē' | 'ē' => vec!['e', 'i'], + 'Ō' | 'ō' => vec!['o', 'u'], + _ => vec![c], + } +} + +fn lookup_english_word_ipa(word: &str) -> Option<&'static str> { + match word { + "airport" => Some("ɛɚpɔɹt"), + "and" => Some("ænd"), + "art" => Some("ɑɹt"), + "avenue" => Some("ævənuː"), + "atomic" => Some("ətɑmɪk"), + "beach" => Some("biːtʃ"), + "beer" => Some("bɪɹ"), + "big" => Some("bɪg"), + "blue" => Some("bluː"), + "branch" => Some("bɹæntʃ"), + "bomb" => Some("bɑm"), + "botanical" => Some("bətænɪkəl"), + "building" => Some("bɪldɪŋ"), + "business" => Some("bɪznəs"), + "bus" => Some("bʌs"), + "cable" => Some("keɪbəl"), + "campus" => Some("kæmpəs"), + "castle" => Some("kæsəl"), + "center" | "centre" => Some("sɛntɚ"), + "central" => Some("sɛntɹəl"), + "city" => Some("sɪti"), + "commuter" => Some("kəmjuːtɚ"), + "conference" => Some("kɑnfɚəns"), + "cruise" => Some("kɹuːz"), + "cross" => Some("kɹɔs"), + "district" => Some("dɪstɹɪkt"), + "distribution" => Some("dɪstɹəbjuːʃən"), + "direct" => Some("dɚɛkt"), + "east" => Some("iːst"), + "electric" => Some("ɪlɛktɹɪk"), + "elementary" => Some("ɛləməntɛɹi"), + "entrance" => Some("ɛntɹəns"), + "evening" => Some("iːvnɪŋ"), + "express" => Some("ɪkspɹɛs"), + "family" => Some("fæməli"), + "ferry" => Some("fɛɹi"), + "flower" => Some("flaʊɚ"), + "for" => Some("fɔɹ"), + "from" => Some("fɹʌm"), + "fruit" 
=> Some("fɹuːt"), + "garden" => Some("gɑɹdən"), + "gardens" => Some("gɑɹdənz"), + "gateway" => Some("geɪtweɪ"), + "general" => Some("dʒɛnɚəl"), + "golf" => Some("gɑlf"), + "green" => Some("gɹiːn"), + "ground" => Some("gɹaʊnd"), + "gymnasium" => Some("dʒɪmneɪziəm"), + "hall" => Some("hɔl"), + "high" => Some("haɪ"), + "hospital" => Some("hɑspɪtəl"), + "industrial" => Some("ɪndʌstɹiəl"), + "international" => Some("ɪntɚnæʃənəl"), + "island" => Some("aɪlənd"), + "isle" => Some("aɪl"), + "japan" => Some("dʒəpæn"), + "jr" => Some("dʒeɪ ɑɹ"), + "junior" => Some("dʒuːnjɚ"), + "keisei" => Some("keːseː"), + "line" => Some("laɪn"), + "link" => Some("lɪŋk"), + "liner" => Some("laɪnɚ"), + "lrt" => Some("ɛl ɑɹ tiː"), + "limited" => Some("lɪmɪtɪd"), + "local" => Some("loʊkəl"), + "loop" => Some("luːp"), + "main" => Some("meɪn"), + "mae" => Some("mae"), + "management" => Some("mænɪdʒmənt"), + "marine" => Some("məɹiːn"), + "medical" => Some("mɛdɪkəl"), + "metro" => Some("mɛtɹoʊ"), + "monorail" => Some("mɑnoʊɹeɪl"), + "morning" => Some("mɔɹnɪŋ"), + "museum" => Some("mjuːziəm"), + "municipal" => Some("mjuːnɪsəpəl"), + "new" => Some("nuː"), + "north" => Some("nɔɹθ"), + "or" => Some("ɔɹ"), + "park" => Some("pɑɹk"), + "peace" => Some("piːs"), + "port" => Some("pɔɹt"), + "pool" => Some("puːl"), + "railway" => Some("ɹeɪlweɪ"), + "rail" => Some("ɹeɪl"), + "rapid" => Some("ɹæpɪd"), + "red" => Some("ɹɛd"), + "regional" => Some("ɹiːdʒənəl"), + "relay" => Some("ɹiːleɪ"), + "ropeway" => Some("ɹoʊpweɪ"), + "route" => Some("ɹuːt"), + "scenic" => Some("siːnɪk"), + "saint" => Some("seɪnt"), + "school" => Some("skuːl"), + "science" => Some("saɪəns"), + "section" => Some("sɛkʃən"), + "seaside" => Some("siːsaɪd"), + "semi" => Some("sɛmi"), + "senior" => Some("siːnjɚ"), + "shiyakusho" => Some("ɕijakɯɕo"), + "sight" => Some("saɪt"), + "site" => Some("saɪt"), + "skiing" => Some("skiːɪŋ"), + "skytree" => Some("skaɪtɹiː"), + "soccer" => Some("sɑkɚ"), + "south" => Some("saʊθ"), + "space" => Some("speɪs"), + 
"special" => Some("spɛʃəl"), + "sports" => Some("spɔɹts"), + "square" => Some("skwɛɚ"), + "stadium" => Some("steɪdiəm"), + "station" => Some("steɪʃən"), + "streetcar" => Some("stɹiːtkɑɹ"), + "subway" => Some("sʌbweɪ"), + "service" => Some("sɝvɪs"), + "shuttle" => Some("ʃʌtəl"), + "sub" => Some("sʌb"), + "sunrise" => Some("sʌnɹaɪz"), + "super" => Some("suːpɚ"), + "telecom" => Some("tɛləkɑm"), + "teleport" => Some("tɛləpɔɹt"), + "terminal" => Some("tɚmɪnəl"), + "the" => Some("ðə"), + "town" => Some("taʊn"), + "to" => Some("tuː"), + "trade" => Some("tɹeɪd"), + "train" => Some("tɹeɪn"), + "transit" => Some("tɹænsɪt"), + "tramway" => Some("tɹæmweɪ"), + "tram" => Some("tɹæm"), + "transport" => Some("tɹænspɔɹt"), + "university" => Some("juːnəvɚsəti"), + "universal" => Some("juːnəvɚsəl"), + "urban" => Some("ɝbən"), + "village" => Some("vɪlɪdʒ"), + "way" => Some("weɪ"), + "west" => Some("wɛst"), + "world" => Some("wɝld"), + "yard" => Some("jɑɹd"), + "railroad" => Some("ɹeɪlɹoʊd"), + "access" => Some("æksɛs"), + "excursion" => Some("ɪkskɝʒən"), + "holiday" => Some("hɑlədeɪ"), + "nonstop" => Some("nɑnstɑp"), + "weekday" => Some("wiːkdeɪ"), + "southern" => Some("sʌðɚn"), + "sky" => Some("skaɪ"), + "office" => Some("ɔfɪs"), + "police" => Some("pəliːs"), + "shrine" => Some("ʃɹaɪn"), + "temple" => Some("tɛmpəl"), + "prefectural" => Some("pɹifɛktʃɚəl"), + "bridge" => Some("bɹɪdʒ"), + "plaza" => Some("plɑːzə"), + "canal" => Some("kənæl"), + "hotel" => Some("hoʊtɛl"), + "cathedral" => Some("kəθiːdɹəl"), + "arts" => Some("ɑɹts"), + "crafts" => Some("kɹæfts"), + "theater" => Some("θiətɚ"), + "abt" => Some("eɪ biː tiː"), + "angelland" => Some("eɪndʒəllænd"), + "arcade" => Some("ɑɹkeɪd"), + "anoh" => Some("ano"), + "astram" => Some("æstɹæm"), + "balloon" => Some("bəluːn"), + "boat" => Some("boʊt"), + "bitchu" => Some("bit͡ɕɯ"), + "bitchuu" => Some("bit͡ɕɯː"), + "bosch" => Some("bɑʃ"), + "car" => Some("kɑɹ"), + "centerpool" => Some("sɛntɚpuːl"), + "centralpark" => Some("sɛntɹəlpɑɹk"), + 
"chinatown" => Some("tʃaɪnətaʊn"), + "chikucenter" => Some("tʃikjuːsɛntɚ"), + "civic" => Some("sɪvɪk"), + "circuit" => Some("sɝkɪt"), + "cosmosquare" => Some("kɑzmoʊskwɛɚ"), + "dam" => Some("dæm"), + "depot" => Some("diːpoʊ"), + "dinostar" => Some("daɪnoʊstɑɹ"), + "english" => Some("ɪŋglɪʃ"), + "etchu" => Some("ett͡ɕɯ"), + "etchuu" => Some("ett͡ɕɯː"), + "esta" => Some("ɛstə"), + "expo" => Some("ɛkspoʊ"), + "galaxy" => Some("gæləksi"), + "gorge" => Some("gɔɹdʒ"), + "hatchobaba" => Some("hatt͡ɕoːbaba"), + "hatchobori" => Some("hatt͡ɕoːboɾi"), + "huis" => Some("haʊs"), + "itchome" => Some("itt͡ɕoːme"), + "ir" => Some("aɪ ɑɹ"), + "j" => Some("dʒeɪ"), + "juhatchome" => Some("dʑɯːhatt͡ɕoːme"), + "kintestu" => Some("kintetsɯ"), + "kutchan" => Some("kɯtt͡ɕaɴ"), + "linimo" => Some("linimo"), + "minoh" => Some("minoː"), + "newtown" => Some("njuːtaʊn"), + "no.1" => Some("nʌmbɚ wʌn"), + "no.6" => Some("nʌmbɚ sɪks"), + "no.7" => Some("nʌmbɚ sɛvən"), + "no.8" => Some("nʌmbɚ eɪt"), + "peach" => Some("piːtʃ"), + "retro" => Some("ɹɛtɹoʊ"), + "rias" => Some("ɹiːəs"), + "shim" => Some("ɕiɴ"), + "side" => Some("saɪd"), + "skyliner" => Some("skaɪlaɪnɚ"), + "skyrail" => Some("skaɪɹeɪl"), + "sonic" => Some("sɑnɪk"), + "saphir" => Some("sæfiɹ"), + "spacia" => Some("speɪʃə"), + "sta" => Some("steɪʃən"), + "sunport" => Some("sʌnpɔɹt"), + "th" => Some("tiː eɪtʃ"), + "through" => Some("θɹuː"), + "thunderbird" => Some("θʌndɚbɝd"), + "tj" => Some("tiː dʒeɪ"), + "wing" => Some("wɪŋ"), + "woody" => Some("wʊdi"), + "x" => Some("ɛks"), + "aqua" => Some("ækwə"), + "lavender" => Some("lævəndɚ"), + "lilac" => Some("laɪlæk"), + "okhotsk" => Some("oʊkhɑtsk"), + "b" => Some("biː"), + "crossbay" => Some("kɹɔsbeɪ"), + "farm" => Some("fɑɹm"), + "field" => Some("fiːld"), + "gala" => Some("gɑːlə"), + "girls" => Some("gɝlz"), + "grand" => Some("gɹænd"), + "highland" => Some("haɪlənd"), + "hills" => Some("hɪlz"), + "harmonyhall" => Some("hɑɹmənihɔl"), + "harborland" => Some("hɑɹbɚlænd"), + "heartpia" => 
Some("hɑɹtpiə"), + "land" => Some("lænd"), + "laketown" => Some("leɪktaʊn"), + "mall" => Some("mɔl"), + "mary's" => Some("mɛɹiz"), + "mt" => Some("maʊnt"), + "mt.takao" => Some("maʊnt taka.o"), + "mt.fuji" => Some("maʊnt ɸɯdʑi"), + "norfolk" => Some("nɔɹfoʊk"), + "ohmi" => Some("oːmi"), + "oarks" => Some("oʊks"), + "paddy" => Some("pædi"), + "pref" => Some("pɹɛf"), + "costa" => Some("kɔstə"), + "grandberry" => Some("gɹændbɛɹi"), + "fujifilm" => Some("ɸɯdʑifɪɾɯm"), + "fujitec" => Some("ɸɯdʑitek"), + "intec" => Some("ɪntek"), + "jatco" => Some("dʒætkoʊ"), + "s" => Some("ɛs"), + "t" => Some("tiː"), + "trans" => Some("tɹæns"), + "zoological" => Some("zuːəlɑdʒɪkəl"), + _ => None, + } +} + +fn number_to_ipa(word: &str) -> Option<&'static str> { + match word { + "0" => Some("zɪɹoʊ"), + "1" => Some("wʌn"), + "2" => Some("tuː"), + "3" => Some("θɹiː"), + "4" => Some("fɔɹ"), + "5" => Some("faɪv"), + "6" => Some("sɪks"), + "7" => Some("sɛvən"), + "8" => Some("eɪt"), + "9" => Some("naɪn"), + _ => None, + } +} + +fn romaji_to_katakana(input: &str) -> Option<String> { + if input.is_empty() { + return Some(String::new()); + } + + let chars: Vec<char> = input.chars().collect(); + let mut out = String::new(); + let mut i = 0; + + while i < chars.len() { + if chars[i] == '\'' { + i += 1; + continue; + } + + if i + 1 < chars.len() + && chars[i] == chars[i + 1] + && chars[i] != 'n' + && is_romaji_consonant(chars[i]) + { + out.push('ッ'); + i += 1; + continue; + } + + if chars[i] == 'n' || (chars[i] == 'm' && i + 1 < chars.len() && is_bilabial(chars[i + 1])) + { + if i + 1 == chars.len() { + out.push('ン'); + i += 1; + continue; + } + + let next = chars[i + 1]; + if next == 'n' { + out.push('ン'); + i += 1; + continue; + } + + if !is_romaji_vowel(next) && next != 'y' { + out.push('ン'); + i += 1; + continue; + } + } + + if let Some((kana, consumed)) = match_romaji_chunk(&chars[i..]) { + out.push_str(kana); + i += consumed; + continue; + } + + return None; + } + + Some(out) +} + +fn is_romaji_vowel(c: 
char) -> bool { + matches!(c, 'a' | 'i' | 'u' | 'e' | 'o') +} + +fn is_romaji_consonant(c: char) -> bool { + c.is_ascii_alphabetic() && !is_romaji_vowel(c) +} + +fn is_bilabial(c: char) -> bool { + matches!(c, 'b' | 'p' | 'm') +} + +fn match_romaji_chunk(chars: &[char]) -> Option<(&'static str, usize)> { + const MAP: &[(&str, &str)] = &[ + ("ltsu", "ッ"), + ("xtsu", "ッ"), + ("kya", "キャ"), + ("kyu", "キュ"), + ("kyo", "キョ"), + ("gya", "ギャ"), + ("gyu", "ギュ"), + ("gyo", "ギョ"), + ("sha", "シャ"), + ("shu", "シュ"), + ("sho", "ショ"), + ("sya", "シャ"), + ("syu", "シュ"), + ("syo", "ショ"), + ("cha", "チャ"), + ("chu", "チュ"), + ("cho", "チョ"), + ("tya", "チャ"), + ("tyu", "チュ"), + ("tyo", "チョ"), + ("nya", "ニャ"), + ("nyu", "ニュ"), + ("nyo", "ニョ"), + ("hya", "ヒャ"), + ("hyu", "ヒュ"), + ("hyo", "ヒョ"), + ("mya", "ミャ"), + ("myu", "ミュ"), + ("myo", "ミョ"), + ("rya", "リャ"), + ("ryu", "リュ"), + ("ryo", "リョ"), + ("bya", "ビャ"), + ("byu", "ビュ"), + ("byo", "ビョ"), + ("pya", "ピャ"), + ("pyu", "ピュ"), + ("pyo", "ピョ"), + ("ja", "ジャ"), + ("ju", "ジュ"), + ("jo", "ジョ"), + ("jya", "ジャ"), + ("jyu", "ジュ"), + ("jyo", "ジョ"), + ("shi", "シ"), + ("chi", "チ"), + ("tsu", "ツ"), + ("fu", "フ"), + ("ji", "ジ"), + ("ka", "カ"), + ("ki", "キ"), + ("ku", "ク"), + ("ke", "ケ"), + ("ko", "コ"), + ("ga", "ガ"), + ("gi", "ギ"), + ("gu", "グ"), + ("ge", "ゲ"), + ("go", "ゴ"), + ("sa", "サ"), + ("su", "ス"), + ("se", "セ"), + ("so", "ソ"), + ("za", "ザ"), + ("zu", "ズ"), + ("ze", "ゼ"), + ("zo", "ゾ"), + ("ta", "タ"), + ("te", "テ"), + ("to", "ト"), + ("da", "ダ"), + ("de", "デ"), + ("do", "ド"), + ("na", "ナ"), + ("ni", "ニ"), + ("nu", "ヌ"), + ("ne", "ネ"), + ("no", "ノ"), + ("ha", "ハ"), + ("hi", "ヒ"), + ("he", "ヘ"), + ("ho", "ホ"), + ("ba", "バ"), + ("bi", "ビ"), + ("bu", "ブ"), + ("be", "ベ"), + ("bo", "ボ"), + ("pa", "パ"), + ("pi", "ピ"), + ("pu", "プ"), + ("pe", "ペ"), + ("po", "ポ"), + ("ma", "マ"), + ("mi", "ミ"), + ("mu", "ム"), + ("me", "メ"), + ("mo", "モ"), + ("ya", "ヤ"), + ("yu", "ユ"), + ("yo", "ヨ"), + ("ra", "ラ"), + ("ri", "リ"), + ("ru", "ル"), + ("re", "レ"), + ("ro", 
"ロ"), + ("wa", "ワ"), + ("wo", "ヲ"), + ("va", "ヴァ"), + ("vi", "ヴィ"), + ("vu", "ヴ"), + ("ve", "ヴェ"), + ("vo", "ヴォ"), + ("a", "ア"), + ("i", "イ"), + ("u", "ウ"), + ("e", "エ"), + ("o", "オ"), + ]; + + for (roman, kana) in MAP { + if chars.len() < roman.len() { + continue; + } + if chars.iter().take(roman.len()).copied().eq(roman.chars()) { + return Some((*kana, roman.len())); + } + } + + None +} + /// Look up a two-character (digraph) combination. /// Handles palatalized sounds (拗音): キャ, シュ, チョ, etc. fn lookup_digraph(c1: char, c2: char) -> Option { @@ -578,6 +1198,78 @@ mod tests { assert_eq!(katakana_to_ipa("シブヤX"), None); } + #[test] + fn test_station_name_ipa_uses_official_english_wording() { + assert_eq!( + station_name_to_ipa("カサイリンカイコウエン", Some("Kasai-Rinkai Park")), + Some("kasa.i ɾiŋka.i pɑɹk".to_string()) + ); + } + + #[test] + fn test_station_name_ipa_supports_english_and_digits() { + assert_eq!( + station_name_to_ipa("ナリタクウコウ", Some("Narita Airport Terminal 1")), + Some("naɾita ɛɚpɔɹt tɚmɪnəl wʌn".to_string()) + ); + } + + #[test] + fn test_station_name_ipa_supports_multi_digit_numbers() { + assert_eq!( + station_name_to_ipa("ハネダクウコウ", Some("Haneda Airport Terminal 10")), + Some("haneda ɛɚpɔɹt tɚmɪnəl wʌnzɪɹoʊ".to_string()) + ); + } + + #[test] + fn test_station_name_ipa_falls_back_to_katakana_when_roman_parse_fails() { + assert_eq!( + station_name_to_ipa("シブヤ", Some("???")), + Some("ɕibɯja".to_string()) + ); + } + + #[test] + fn test_station_name_ipa_supports_mixed_english_facility_words() { + assert_eq!( + station_name_to_ipa("トウキョウビッグサイト", Some("Tōkyō Big Sight")), + Some("to.ɯkʲo.ɯ bɪg saɪt".to_string()) + ); + } + + #[test] + fn test_station_name_ipa_supports_common_line_words() { + assert_eq!( + station_name_to_ipa("ヤマノテセン", Some("Yamanote Line")), + Some("jamanote laɪn".to_string()) + ); + } + + #[test] + fn test_station_name_ipa_supports_bilabial_m_in_romaji() { + assert_eq!( + station_name_to_ipa("シンバシ", Some("Shimbashi")), + 
Some("ɕimbaɕi".to_string()) + ); + } + + #[test] + fn test_station_name_ipa_supports_line_related_english_words() { + assert_eq!( + station_name_to_ipa("トウザイセン", Some("Municipal Subway Blue Line")), + Some("mjuːnɪsəpəl sʌbweɪ bluː laɪn".to_string()) + ); + } + + #[test] + fn test_station_name_ipa_supports_train_type_words() { + assert_eq!( + station_name_to_ipa("カイソク", Some("Commuter Rapid")), + Some("kəmjuːtɚ ɹæpɪd".to_string()) + ); + } + #[test] fn test_dokkyo_daigakumae_soka_matsubara() { // Full-width space between words should be preserved diff --git a/stationapi/src/use_case/dto/line.rs b/stationapi/src/use_case/dto/line.rs index 967f4b1f..0fa9692f 100644 --- a/stationapi/src/use_case/dto/line.rs +++ b/stationapi/src/use_case/dto/line.rs @@ -1,7 +1,7 @@ use crate::{ domain::{ entity::{gtfs::TransportType, line::Line}, - ipa::{katakana_to_ipa, replace_line_name_suffix}, + ipa::{katakana_to_ipa, replace_line_name_suffix, station_name_to_ipa}, }, proto::{Line as GrpcLine, TransportType as GrpcTransportType}, }; @@ -13,6 +13,8 @@ impl From for GrpcLine { katakana_to_ipa(stem).map(|ipa| format!("{ipa}{suffix_ipa}")) } .filter(|ipa| !ipa.is_empty()); + let name_roman_ipa = + station_name_to_ipa("", line.line_name_r.as_deref()).filter(|ipa| !ipa.is_empty()); // バス路線の場合は line_type を OtherLineType (0) に強制 // (鉄道用の line_type が誤って設定されている可能性があるため) let line_type = if line.transport_type == TransportType::Bus { @@ -41,6 +43,7 @@ impl From for GrpcLine { average_distance: line.average_distance.unwrap_or(0.0), transport_type: convert_transport_type(line.transport_type), name_ipa, + name_roman_ipa, } } } @@ -374,12 +377,12 @@ mod tests { } // ============================================ - // name_ipa 変換テスト + // IPA 変換テスト // ============================================ #[test] fn test_name_ipa_sen_suffix_replaced_with_line() { - // 〜セン → IPA + " laɪn" + // name_ipa はカタカナ由来 let mut line = create_test_line(TransportType::Rail, None); line.line_name_k = "セイブイケブクロセン".to_string(); 
let grpc_line: GrpcLine = line.into(); @@ -392,7 +395,6 @@ mod tests { #[test] fn test_name_ipa_honsen_suffix_replaced_with_main_line() { - // 〜ホンセン → IPA + " meɪn laɪn" let mut line = create_test_line(TransportType::Rail, None); line.line_name_k = "トウカイドウホンセン".to_string(); let grpc_line: GrpcLine = line.into(); @@ -405,11 +407,35 @@ mod tests { #[test] fn test_name_ipa_shinkansen_preserved() { - // 〜シンカンセン は英語でもそのまま使われるため置換しない let mut line = create_test_line(TransportType::Rail, None); line.line_name_k = "トウホクシンカンセン".to_string(); let grpc_line: GrpcLine = line.into(); assert_eq!(grpc_line.name_ipa, Some("to.ɯhokɯɕiŋkanseɴ".to_string())); } + + #[test] + fn test_name_roman_ipa_prefers_romanized_line_name_for_keisei() { + let mut line = create_test_line(TransportType::Rail, None); + line.line_name_k = "ケイセイホンセン".to_string(); + line.line_name_r = Some("Keisei Main Line".to_string()); + let grpc_line: GrpcLine = line.into(); + + assert_eq!(grpc_line.name_ipa, Some("ke.ise.i meɪn laɪn".to_string())); + assert_eq!( + grpc_line.name_roman_ipa, + Some("keːseː meɪn laɪn".to_string()) + ); + } + + #[test] + fn test_name_ipa_empty_result_is_normalized_to_none() { + let mut line = create_test_line(TransportType::Rail, None); + line.line_name_k = "".to_string(); + line.line_name_r = None; + let grpc_line: GrpcLine = line.into(); + + assert_eq!(grpc_line.name_ipa, None); + assert_eq!(grpc_line.name_roman_ipa, None); + } } diff --git a/stationapi/src/use_case/dto/station.rs b/stationapi/src/use_case/dto/station.rs index 6e52cc19..58fbad3a 100644 --- a/stationapi/src/use_case/dto/station.rs +++ b/stationapi/src/use_case/dto/station.rs @@ -1,7 +1,7 @@ use crate::{ domain::{ entity::{gtfs::TransportType, station::Station}, - ipa::katakana_to_ipa, + ipa::{katakana_to_ipa, station_name_to_ipa}, }, proto::{Station as GrpcStation, TransportType as GrpcTransportType}, }; @@ -18,6 +18,7 @@ impl From for i32 { impl From for GrpcStation { fn from(station: Station) -> Self { let name_ipa = 
katakana_to_ipa(&station.station_name_k).filter(|ipa| !ipa.is_empty()); + let name_roman_ipa = station_name_to_ipa("", station.station_name_r.as_deref()); Self { id: station.station_cd as u32, group_id: station.station_g_cd as u32, @@ -48,6 +49,7 @@ impl From for GrpcStation { train_type: station.train_type.map(|tt| Box::new((*tt).into())), transport_type: station.transport_type.into(), name_ipa, + name_roman_ipa, } } } diff --git a/stationapi/src/use_case/dto/train_type.rs b/stationapi/src/use_case/dto/train_type.rs index 03443e8f..d3de503a 100644 --- a/stationapi/src/use_case/dto/train_type.rs +++ b/stationapi/src/use_case/dto/train_type.rs @@ -1,4 +1,10 @@ -use crate::{domain::entity::train_type::TrainType, proto::TrainType as GrpcTrainType}; +use crate::{ + domain::{ + entity::train_type::TrainType, + ipa::{katakana_to_ipa, station_name_to_ipa}, + }, + proto::TrainType as GrpcTrainType, +}; impl From for GrpcTrainType { fn from(train_type: TrainType) -> Self { @@ -19,6 +25,8 @@ impl From for GrpcTrainType { lines, kind, } = train_type; + let name_ipa = katakana_to_ipa(&type_name_k).filter(|ipa| !ipa.is_empty()); + let name_roman_ipa = station_name_to_ipa("", type_name_r.as_deref()); Self { id: id.map(|id| id as u32).unwrap_or(0), type_id: type_cd.map(|id| id as u32).unwrap_or(0), @@ -33,6 +41,39 @@ impl From for GrpcTrainType { lines: lines.into_iter().map(|line| line.into()).collect(), direction: direction.unwrap_or(0), kind: kind.unwrap_or(0), + name_ipa, + name_roman_ipa, } } } + +#[cfg(test)] +mod tests { + use super::*; + + fn create_test_train_type() -> TrainType { + TrainType::new( + Some(1), + Some(1130201), + Some(1001), + Some(1001), + Some(1), + "快速".to_string(), + "カイソク".to_string(), + Some("Rapid".to_string()), + Some("快速".to_string()), + Some("쾌속".to_string()), + "#ff6600".to_string(), + Some(0), + Some(1), + ) + } + + #[test] + fn test_train_type_sets_katakana_and_roman_ipa() { + let grpc_train_type: GrpcTrainType = 
create_test_train_type().into(); + + assert_eq!(grpc_train_type.name_ipa, Some("ka.isokɯ".to_string())); + assert_eq!(grpc_train_type.name_roman_ipa, Some("ɹæpɪd".to_string())); + } +} diff --git a/stationapi/src/use_case/interactor/query.rs b/stationapi/src/use_case/interactor/query.rs index ee3f30b9..c74118f9 100644 --- a/stationapi/src/use_case/interactor/query.rs +++ b/stationapi/src/use_case/interactor/query.rs @@ -836,6 +836,8 @@ where let name_ipa = crate::domain::ipa::katakana_to_ipa(&row.station_name_k) .filter(|ipa| !ipa.is_empty()); + let name_roman_ipa = + crate::domain::ipa::station_name_to_ipa("", row.station_name_r.as_deref()); proto::StationMinimal { id: row.station_cd as u32, group_id: row.station_g_cd as u32, @@ -848,6 +850,7 @@ where has_train_types: Some(row.type_id.is_some()), train_type_id: row.type_id.map(|id| id as u32), name_ipa, + name_roman_ipa, } }) .collect::<Vec<_>>(); diff --git a/tools/ipa_audit.rs b/tools/ipa_audit.rs new file mode 100644 index 00000000..40828825 --- /dev/null +++ b/tools/ipa_audit.rs @@ -0,0 +1,163 @@ +use std::collections::{BTreeMap, BTreeSet}; +use std::fs::File; +use std::io::{BufRead, BufReader}; + +include!("../stationapi/src/domain/ipa.rs"); + +struct Dataset { + label: &'static str, + path: &'static str, + roman_column: usize, +} + +#[derive(Default)] +struct TokenStats { + count: usize, + examples: BTreeSet<String>, +} + +fn main() -> Result<(), Box<dyn std::error::Error>> { + let datasets = [ + Dataset { + label: "lines", + path: "data/2!lines.csv", + roman_column: 5, + }, + Dataset { + label: "stations", + path: "data/3!stations.csv", + roman_column: 4, + }, + Dataset { + label: "train_types", + path: "data/4!types.csv", + roman_column: 4, + }, + ]; + + for dataset in datasets { + audit_dataset(&dataset)?; + } + + Ok(()) +} + +fn audit_dataset(dataset: &Dataset) -> Result<(), Box<dyn std::error::Error>> { + let file = File::open(dataset.path)?; + let reader = BufReader::new(file); + let mut total_names = 0usize; + let mut unresolved_names = 0usize; + let mut 
unresolved_tokens: BTreeMap<String, TokenStats> = BTreeMap::new(); + + for (index, line) in reader.lines().enumerate() { + let line = line?; + if index == 0 { + continue; + } + + let columns = parse_csv_line(&line); + let Some(name_roman) = columns.get(dataset.roman_column) else { + continue; + }; + let name_roman = name_roman.trim(); + if name_roman.is_empty() { + continue; + } + + total_names += 1; + if romanized_name_to_ipa(name_roman).is_none() { + unresolved_names += 1; + } + + for token in extract_tokens(name_roman) { + if word_to_ipa(&token).is_some() { + continue; + } + let entry = unresolved_tokens.entry(token).or_default(); + entry.count += 1; + if entry.examples.len() < 3 { + entry.examples.insert(name_roman.to_string()); + } + } + } + + println!( + "[{}] names: {} total / {} unresolved", + dataset.label, total_names, unresolved_names + ); + + if unresolved_tokens.is_empty() { + println!("[{}] unresolved tokens: none", dataset.label); + println!(); + return Ok(()); + } + + println!("[{}] unresolved tokens:", dataset.label); + let mut sorted_tokens: Vec<_> = unresolved_tokens.into_iter().collect(); + sorted_tokens.sort_by(|a, b| b.1.count.cmp(&a.1.count).then_with(|| a.0.cmp(&b.0))); + + for (token, stats) in sorted_tokens.into_iter().take(40) { + let examples = stats.examples.into_iter().collect::<Vec<_>>().join(" / "); + println!(" {} ({}) [{}]", token, stats.count, examples); + } + println!(); + + Ok(()) +} + +fn extract_tokens(input: &str) -> Vec<String> { + let mut tokens = Vec::new(); + let mut current = String::new(); + + for c in input.chars() { + if is_name_token_char(c) { + current.push(c); + continue; + } + + if !current.is_empty() { + let token = normalize_name_token(&current); + if !token.is_empty() { + tokens.push(token); + } + current.clear(); + } + } + + if !current.is_empty() { + let token = normalize_name_token(&current); + if !token.is_empty() { + tokens.push(token); + } + } + + tokens +} + +fn parse_csv_line(line: &str) -> Vec<String> { + let mut output = Vec::new(); + let mut current = 
String::new(); + let mut in_quotes = false; + let mut chars = line.chars().peekable(); + + while let Some(c) = chars.next() { + match c { + '"' => { + if in_quotes && chars.peek() == Some(&'"') { + current.push('"'); + chars.next(); + } else { + in_quotes = !in_quotes; + } + } + ',' if !in_quotes => { + output.push(current); + current = String::new(); + } + _ => current.push(c), + } + } + + output.push(current); + output +}