From c8e1ffdacf9b11b24114b35bf48f58f9ba665095 Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Mon, 1 Dec 2025 11:32:44 +0800 Subject: [PATCH 1/4] add itn en data --- .gitignore | 4 +- itn/english/data/currency.tsv | 168 ++++++++ itn/english/data/date_period.tsv | 10 + itn/english/data/electronic/domain.tsv | 10 + itn/english/data/electronic/server_name.tsv | 17 + itn/english/data/electronic/symbols.tsv | 22 + itn/english/data/electronic/url_symbols.tsv | 5 + itn/english/data/magnitudes.tsv | 4 + itn/english/data/measurements.tsv | 145 +++++++ itn/english/data/months.tsv | 12 + itn/english/data/months_cased.tsv | 12 + itn/english/data/numbers/digit.tsv | 9 + itn/english/data/numbers/hundred.tsv | 1 + itn/english/data/numbers/teen.tsv | 10 + itn/english/data/numbers/thousands.tsv | 22 + itn/english/data/numbers/ties.tsv | 9 + itn/english/data/numbers/zero.tsv | 1 + itn/english/data/ordinals/digit.tsv | 9 + itn/english/data/ordinals/teen.tsv | 1 + itn/english/data/time/minute_to.tsv | 59 +++ itn/english/data/time/time_suffix.tsv | 8 + itn/english/data/time/time_suffix_cased.tsv | 8 + itn/english/data/time/time_zone.tsv | 7 + itn/english/data/time/time_zone_cased.tsv | 7 + itn/english/data/time/to_hour.tsv | 12 + itn/english/data/whitelist.tsv | 417 ++++++++++++++++++ itn/english/data/whitelist_tech.tsv | 450 ++++++++++++++++++++ itn/english/data/year_suffix.tsv | 6 + itn/english/test/data/cardinal.txt | 60 +++ itn/english/test/data/date.txt | 70 +++ itn/english/test/data/decimal.txt | 129 ++++++ itn/english/test/data/electronic.txt | 59 +++ itn/english/test/data/measure.txt | 232 ++++++++++ itn/english/test/data/money.txt | 54 +++ itn/english/test/data/ordinal.txt | 34 ++ itn/english/test/data/telephone.txt | 18 + itn/english/test/data/time.txt | 33 ++ itn/english/test/data/whitelist.txt | 9 + itn/english/test/data/word.txt | 49 +++ 39 files changed, 2190 insertions(+), 2 deletions(-) create mode 100644 itn/english/data/currency.tsv create mode 
100644 itn/english/data/date_period.tsv create mode 100644 itn/english/data/electronic/domain.tsv create mode 100644 itn/english/data/electronic/server_name.tsv create mode 100644 itn/english/data/electronic/symbols.tsv create mode 100644 itn/english/data/electronic/url_symbols.tsv create mode 100644 itn/english/data/magnitudes.tsv create mode 100644 itn/english/data/measurements.tsv create mode 100644 itn/english/data/months.tsv create mode 100644 itn/english/data/months_cased.tsv create mode 100644 itn/english/data/numbers/digit.tsv create mode 100644 itn/english/data/numbers/hundred.tsv create mode 100644 itn/english/data/numbers/teen.tsv create mode 100644 itn/english/data/numbers/thousands.tsv create mode 100644 itn/english/data/numbers/ties.tsv create mode 100644 itn/english/data/numbers/zero.tsv create mode 100644 itn/english/data/ordinals/digit.tsv create mode 100644 itn/english/data/ordinals/teen.tsv create mode 100644 itn/english/data/time/minute_to.tsv create mode 100644 itn/english/data/time/time_suffix.tsv create mode 100644 itn/english/data/time/time_suffix_cased.tsv create mode 100644 itn/english/data/time/time_zone.tsv create mode 100644 itn/english/data/time/time_zone_cased.tsv create mode 100644 itn/english/data/time/to_hour.tsv create mode 100644 itn/english/data/whitelist.tsv create mode 100644 itn/english/data/whitelist_tech.tsv create mode 100644 itn/english/data/year_suffix.tsv create mode 100644 itn/english/test/data/cardinal.txt create mode 100644 itn/english/test/data/date.txt create mode 100644 itn/english/test/data/decimal.txt create mode 100644 itn/english/test/data/electronic.txt create mode 100644 itn/english/test/data/measure.txt create mode 100644 itn/english/test/data/money.txt create mode 100644 itn/english/test/data/ordinal.txt create mode 100644 itn/english/test/data/telephone.txt create mode 100644 itn/english/test/data/time.txt create mode 100644 itn/english/test/data/whitelist.txt create mode 100644 
itn/english/test/data/word.txt diff --git a/.gitignore b/.gitignore index 3a1ec79c..f74e50bf 100644 --- a/.gitignore +++ b/.gitignore @@ -40,5 +40,5 @@ compile_commands.json WeTextProcessing.egg-info/ build/ dist/ -tn/*.far -itn/*.far +tn/*.fst +itn/*.fst diff --git a/itn/english/data/currency.tsv b/itn/english/data/currency.tsv new file mode 100644 index 00000000..2f1fcc09 --- /dev/null +++ b/itn/english/data/currency.tsv @@ -0,0 +1,168 @@ +؋ afghan afghani +l albanian lek +دج algerian dinar +kz angolan kwanza +aed arab emirates dirham +$ argentine peso +֏ armenian dram +ƒ aruban florin +a$ australian dollar +₼ azerbaijani manat +b$ bahamian dollar +.د.ب bahraini dinar +৳ bangladeshi taka +$ barbadian dollar +br belarusian ruble +bz$ belize dollar +$ bermudian dollar +nu bhutanese ngultrum +₿ bitcoin +bs bolivian boliviano +km bosnia and herzegovina convertible mark +p botswana pula +r$ brazilian real +£ british pound +b$ brunei dollar +лв bulgarian lev +fbu burundian franc +៛ cambodian riel +ca$ canadian dollar +cve cape verde escudo +cve cape verdean escudo +ci$ cayman islands dollar +cfa cfa franc +$ chilean peso +元 chinese yuan +$ colombian peso +cf comorian franc +fc congolese franc +₡ costa rican colón +kn croatian kuna +cuc$ cuban peso +cyp cypriot pound +kč czech koruna +d dalasi +kr danish krone +fdj djiboutian franc +$ dollar +rd$ dominican peso +$ east caribbean dollar +e£ egyptian pound +aed emirati dirham +nkf eritrean nakfa +Ξ ethereum +br ethiopian birr +€ euro +fk£ falkland islands pound +fim finnish markka +₾ georgian lari +dm german mark +gh₵ ghanaian cedi +q guatemalan quetzal +fg guinean franc +g$ guyanese dollar +g haitian gourde +l honduran lempira +hk$ hong kong dollar +ft hungarian forint +₹ inr +íkr icelandic króna +₹ indian rupee +rp indonesian rupiah +﷼ iranian rial +ع.د iraqi dinar +₪ israeli new shekel +₪ israeli shekel +j$ jamaican dollar +¥ japanese yen +ينار jordanian dinar +₸ kazakhstani tenge +ksh kenyan shilling +ك kuwaiti dinar 
+k kyat +som kyrgyzstani som +₭ lao kip +ლარი lari +ل.ل lebanese pound +l lesotho loti +ld$ liberian dollar +ld libyan dinar +l lilangeni +ł litecoin +mop$ macanese pataca +ar malagasy ariary +k malawian kwacha +rm malaysian ringgit +rf maldivian rufiyaa +mro mauritanian ouguiya +₨ mauritian rupee +$ mexican peso +l moldovan leu +ɱ monero +₮ mongolian tögrög +dh moroccan dirham +mt mozambican metical +n$ namibian dollar +rs nepalese rupee +ƒ netherlands antillean guilder +nt$ new taiwan dollar +nz$ new zealand dollar +c$ nicaraguan córdoba +₦ nigerian naira +₩ north korean won +kr norwegian krone +ر.ع omani rial +um ouguiya +pkr pakistani rupee +b/. panamanian balboa +₲ paraguayan guaraní +s/. peruvian sol +₱ philippine peso +zł polish zloty +£ pounds sterling +ر.ق qatari riyal +£ quid +r real +xrp ripples +lei romanian leu +₹ rupee +₽ russian ruble +r₣ rwandan franc +shp saint helena pounds +db são tomé and príncipe dobra +sar saudi riyal +ден second macedonian denar +rsd serbian dinar +scr seychelles rupee +le sierra leonean leone +s$ singapore dollar +sh.so. somali shilling +tjs somoni +r south african rand +₩ south korean won +ss£ south sudanese pound +රු sri lankan rupee +sdg sudanese pound +sr$ surinamese dollar +kr swedish krona +chf swiss franc +£s syrian pound +৳ taka +tzs tanzanian shilling +₮ tether +฿ thai baht +tt$ trinidad and tobago dollar +د.ت tunisian dinar +₺ turkish lira +m turkmen new manat +ush ugandan shilling +₴ ukrainian hryvna +$ united states dollar +$u uruguayan peso +$ us dollar +som uzbekistan som +bs. 
venezuelan bolívar +₫ vietnamese đồng +₩ won +﷼ yemeni rial +¥ yen diff --git a/itn/english/data/date_period.tsv b/itn/english/data/date_period.tsv new file mode 100644 index 00000000..b53ef7db --- /dev/null +++ b/itn/english/data/date_period.tsv @@ -0,0 +1,10 @@ +Q1 first quarter +Q1 q one +Q2 second quarter +Q2 q two +Q3 third quarter +Q3 q three +Q4 fourth quarter +Q4 q four +H1 first half +H2 second half \ No newline at end of file diff --git a/itn/english/data/electronic/domain.tsv b/itn/english/data/electronic/domain.tsv new file mode 100644 index 00000000..207a14ae --- /dev/null +++ b/itn/english/data/electronic/domain.tsv @@ -0,0 +1,10 @@ +com +uk +fr +net +br +in +ru +de +it +ai diff --git a/itn/english/data/electronic/server_name.tsv b/itn/english/data/electronic/server_name.tsv new file mode 100644 index 00000000..ca7f97bc --- /dev/null +++ b/itn/english/data/electronic/server_name.tsv @@ -0,0 +1,17 @@ +g mail gmail +gmail +n vidia nvidia +nvidia +outlook +hotmail +yahoo +aol +gmx +msn +live +yandex +orange +wanadoo +web +comcast +google diff --git a/itn/english/data/electronic/symbols.tsv b/itn/english/data/electronic/symbols.tsv new file mode 100644 index 00000000..5be8da87 --- /dev/null +++ b/itn/english/data/electronic/symbols.tsv @@ -0,0 +1,22 @@ +. dot +- dash +- hyphen +_ underscore +! exclamation mark +# number sign +$ dollar sign +% percent sign +& ampersand +' quote +* asterisk ++ plus +/ slash += equal sign +? question mark +^ circumflex +` right single quote +{ left brace +| vertical bar +} right brace +~ tilde +, comma \ No newline at end of file diff --git a/itn/english/data/electronic/url_symbols.tsv b/itn/english/data/electronic/url_symbols.tsv new file mode 100644 index 00000000..2946fde3 --- /dev/null +++ b/itn/english/data/electronic/url_symbols.tsv @@ -0,0 +1,5 @@ +. 
dot +- dash +- hyphen +_ underscore +/ slash \ No newline at end of file diff --git a/itn/english/data/magnitudes.tsv b/itn/english/data/magnitudes.tsv new file mode 100644 index 00000000..3bfa4849 --- /dev/null +++ b/itn/english/data/magnitudes.tsv @@ -0,0 +1,4 @@ +k thousand +m million +b billion +t trillion \ No newline at end of file diff --git a/itn/english/data/measurements.tsv b/itn/english/data/measurements.tsv new file mode 100644 index 00000000..894eacd4 --- /dev/null +++ b/itn/english/data/measurements.tsv @@ -0,0 +1,145 @@ +°F fahrenheit +°C celsius +°F degree fahrenheit +°C degree celsius +K kelvin +km kilometer +m meter +cm centimeter +mm millimeter +ha hectare +mi mile +m² square meter +km² square kilometer +ft foot +% percent +hz hertz +kW kilowatt +kW kilo watt +kWh kilo watt hour +kWh kilowatt hour +Wh watt hour +W watt +hp horsepower +hp horse power +mg milligram +kg kilogram +ghz gigahertz +khz kilohertz +mhz megahertz +v volt +h hour +mc mega coulomb +s second +nm nanometer +rpm revolution per minute +min minute +mA milli ampere +% per cent +m³ cubic meter +mph mile per hour +tW tera watt +mv milli volt +mW megawatt +mW mega watt +μm micrometer +" inch +cc c c +g gram +da dalton +atm atmosphere +ω ohm +db decibel +ps peta second +oz ounce +hl hecto liter +μg microgram +pg petagram +kb kilobit +mb megabit +gb gigabit +tb terabit +pb petabit +ev electron volt +mb megabyte +kb kilobyte +gb gigabyte +tb terabyte +pb peta byte +bps bit per second +kbps kilobit per second +mbps megabit per second +gbps gigabit per second +kbps kilo bit per second +mbps mega bit per second +mbps mega bit per second +tbps terabit per second +tbps tera bit per second +pbps petabit per second +pbps peta bit per second +kb/s kilobyte per second +kb/s kilo byte per second +mb/s megabyte per second +mb/s mega byte per second +gb/s gigabyte per second +gb/s giga byte per second +tb/s terabyte per second +tb/s tera byte per second +pb/s petabyte per second +pb/s peta byte per 
second +st stone +kl kilo liter +tj tera joule +kv kilo volt +mv mega volt +kn kilo newton +kn kilonewton +mm megameter +au astronomical unit +yd yard +rad radian +lm lumen +hs hecto second +mol mole +gpa giga pascal +ml milliliter +gW gigawatt +gW gigaWatt +A ampere +mA mili ampere +µA micro ampere +MA mega ampere +kt knot +kgf kilogram force +ng nano gram +ns nanosecond +ms mega siemens +bar bar +gl giga liter +μs microsecond +dA deci ampere +pa pascal +ds deci second +ms milli second +dm deci meter +dm³ cubic deci meter +amu atomic mass unit +mf mega farad +bq becquerel +pb petabit +mm² square millimeter +cm² square centimeter +sq mi square mile +sq ft square foot +kpa kilopascal +cd candela +tl tera liter +ms mega second +mpa megapascal +pm peta meter +gWh giga watt hour +kcal kilo calory +gy gray +sv sievert +cwt hundredweight +cc c c diff --git a/itn/english/data/months.tsv b/itn/english/data/months.tsv new file mode 100644 index 00000000..1c05ac02 --- /dev/null +++ b/itn/english/data/months.tsv @@ -0,0 +1,12 @@ +january +february +march +april +may +june +july +august +september +october +november +december diff --git a/itn/english/data/months_cased.tsv b/itn/english/data/months_cased.tsv new file mode 100644 index 00000000..750d7937 --- /dev/null +++ b/itn/english/data/months_cased.tsv @@ -0,0 +1,12 @@ +January +February +March +April +May +June +July +August +September +October +November +December diff --git a/itn/english/data/numbers/digit.tsv b/itn/english/data/numbers/digit.tsv new file mode 100644 index 00000000..fa329799 --- /dev/null +++ b/itn/english/data/numbers/digit.tsv @@ -0,0 +1,9 @@ +one 1 +two 2 +three 3 +four 4 +five 5 +six 6 +seven 7 +eight 8 +nine 9 \ No newline at end of file diff --git a/itn/english/data/numbers/hundred.tsv b/itn/english/data/numbers/hundred.tsv new file mode 100644 index 00000000..10e2b964 --- /dev/null +++ b/itn/english/data/numbers/hundred.tsv @@ -0,0 +1 @@ +hundred \ No newline at end of file diff --git 
a/itn/english/data/numbers/teen.tsv b/itn/english/data/numbers/teen.tsv new file mode 100644 index 00000000..8e60fa1d --- /dev/null +++ b/itn/english/data/numbers/teen.tsv @@ -0,0 +1,10 @@ +ten 10 +eleven 11 +twelve 12 +thirteen 13 +fourteen 14 +fifteen 15 +sixteen 16 +seventeen 17 +eighteen 18 +nineteen 19 \ No newline at end of file diff --git a/itn/english/data/numbers/thousands.tsv b/itn/english/data/numbers/thousands.tsv new file mode 100644 index 00000000..bf30542b --- /dev/null +++ b/itn/english/data/numbers/thousands.tsv @@ -0,0 +1,22 @@ +thousand +million +billion +trillion +quadrillion +quintillion +sextillion +septillion +octillion +nonillion +decillion +undecillion +duodecillion +tredecillion +quattuordecillion +quindecillion +sexdecillion +septendecillion +octodecillion +novemdecillion +vigintillion +centillion \ No newline at end of file diff --git a/itn/english/data/numbers/ties.tsv b/itn/english/data/numbers/ties.tsv new file mode 100644 index 00000000..d7ec82e1 --- /dev/null +++ b/itn/english/data/numbers/ties.tsv @@ -0,0 +1,9 @@ +twenty 2 +thirty 3 +forty 4 +fourty 4 +fifty 5 +sixty 6 +seventy 7 +eighty 8 +ninety 9 \ No newline at end of file diff --git a/itn/english/data/numbers/zero.tsv b/itn/english/data/numbers/zero.tsv new file mode 100644 index 00000000..c479272d --- /dev/null +++ b/itn/english/data/numbers/zero.tsv @@ -0,0 +1 @@ +zero 0 \ No newline at end of file diff --git a/itn/english/data/ordinals/digit.tsv b/itn/english/data/ordinals/digit.tsv new file mode 100644 index 00000000..4b7fc24f --- /dev/null +++ b/itn/english/data/ordinals/digit.tsv @@ -0,0 +1,9 @@ +first one +second two +third three +fourth four +fifth five +sixth six +seventh seven +eighth eight +ninth nine \ No newline at end of file diff --git a/itn/english/data/ordinals/teen.tsv b/itn/english/data/ordinals/teen.tsv new file mode 100644 index 00000000..496fefc0 --- /dev/null +++ b/itn/english/data/ordinals/teen.tsv @@ -0,0 +1 @@ +twelfth twelve \ No newline at end of
file diff --git a/itn/english/data/time/minute_to.tsv b/itn/english/data/time/minute_to.tsv new file mode 100644 index 00000000..edab4d5b --- /dev/null +++ b/itn/english/data/time/minute_to.tsv @@ -0,0 +1,59 @@ +1 59 +2 58 +3 57 +4 56 +5 55 +6 54 +7 53 +8 52 +9 51 +10 50 +11 49 +12 48 +13 47 +14 46 +15 45 +16 44 +17 43 +18 42 +19 41 +20 40 +21 39 +22 38 +23 37 +24 36 +25 35 +26 34 +27 33 +28 32 +29 31 +30 30 +31 29 +32 28 +33 27 +34 26 +35 25 +36 24 +37 23 +38 22 +39 21 +40 20 +41 19 +42 18 +43 17 +44 16 +45 15 +46 14 +47 13 +48 12 +49 11 +50 10 +51 9 +52 8 +53 7 +54 6 +55 5 +56 4 +57 3 +58 2 +59 1 diff --git a/itn/english/data/time/time_suffix.tsv b/itn/english/data/time/time_suffix.tsv new file mode 100644 index 00000000..b39a9ef9 --- /dev/null +++ b/itn/english/data/time/time_suffix.tsv @@ -0,0 +1,8 @@ +p m p.m. +pm p.m. +p.m. +p.m p.m. +am a.m. +a.m. +a.m a.m. +a m a.m. \ No newline at end of file diff --git a/itn/english/data/time/time_suffix_cased.tsv b/itn/english/data/time/time_suffix_cased.tsv new file mode 100644 index 00000000..97a4cf5c --- /dev/null +++ b/itn/english/data/time/time_suffix_cased.tsv @@ -0,0 +1,8 @@ +P M P.M. +PM P.M. +P.M. +P.M P.M. +AM A.M. +A.M. +A.M A.M. +A M A.M. 
\ No newline at end of file diff --git a/itn/english/data/time/time_zone.tsv b/itn/english/data/time/time_zone.tsv new file mode 100644 index 00000000..3e0ade46 --- /dev/null +++ b/itn/english/data/time/time_zone.tsv @@ -0,0 +1,7 @@ +cst c s t +cet c e t +pst p s t +est e s t +pt p t +et e t +gmt g m t diff --git a/itn/english/data/time/time_zone_cased.tsv b/itn/english/data/time/time_zone_cased.tsv new file mode 100644 index 00000000..65c70da2 --- /dev/null +++ b/itn/english/data/time/time_zone_cased.tsv @@ -0,0 +1,7 @@ +CST C S T +CET C E T +PST P S T +EST E S T +PT P T +ET E T +GMT G M T diff --git a/itn/english/data/time/to_hour.tsv b/itn/english/data/time/to_hour.tsv new file mode 100644 index 00000000..fba67f1b --- /dev/null +++ b/itn/english/data/time/to_hour.tsv @@ -0,0 +1,12 @@ +one 12 +two 1 +three 2 +four 3 +five 4 +six 5 +seven 6 +eight 7 +nine 8 +ten 9 +eleven 10 +twelve 11 \ No newline at end of file diff --git a/itn/english/data/whitelist.tsv b/itn/english/data/whitelist.tsv new file mode 100644 index 00000000..c7201aef --- /dev/null +++ b/itn/english/data/whitelist.tsv @@ -0,0 +1,417 @@ +10K ten k +1D one d +1G one g +1K one k +2.5G two point five g +2D two d +2G two g +2K two k +3D three d +3G three g +4D four d +4G four g +4K four k +5D five d +5G five g +6G six g +7-eleven seven eleven +7G seven g +8K eight k +AAA triple a +AC a c +ACL a c l +ADAS a das +AEB a e b +AES a e s +AGX a g x +AI a i +AI ai +AI ais +ALU a l u +AMD a m d +API a p i +API api +ARP a r p +ASAP a s a p +ASAP a sap +ASMC a s m c +ASR a s r +ASUS a sus +ASUS asus +AT&T a t and t +ATM a t m +AV1 a v one +AV1 av one +AVAS a v a s +AVAS a vas +AVRCP a v r c p +AWS a w s +AWS aws +AlphaFold alpha fold +AstraZeneca astra zeneca +AstraZeneca astrazeneca +Audio2Face audio to face +Audio2Face audio too face +BBC b b c +BCP b c p +BEV b e v +BGA b g a +BGP b g p +BIOS Bios +BIOS bios +BIS b i s +BSP b s p +BasePOD base pod +BasePOD basepod +Billion-X billion X +C# c sharp +C++ c plus 
plus +CAT5 cat five +CAT5e cat five e +CAT6 cat six +CAT6A cat six a +CAT7 cat seven +CAT8 cat eight +CBC c b c +CCS c c s +CCTV c c t v +CCTV c c tv +CDN c d n +CEO c e o +CES c e s +CFO c f o +CHAdeMO cha demo +CHAdeMO chai demo +CLI c l i +CMO c m o +CMS c m s +CNN c n n +COVID covid +COVID-19 covid nineteen +CPU c p u +CSO c s o +CSP c s p +CSS c s s +CV c v +CVT c v t +CalTech cal tech +CalTech caltech +CapEx cap ex +CapEx capex +ChatGPT chat g p t +CloudXR cloud x r +ConnectX connect x +Core 2 core two +DALI d a l i +DB d b +DC d c +DCDB d c d b +DDR d d r +DDoS d d o s +DDoS d dos +DGL d g l +DGX d g x -1.1 +DGX A100 d g x a hundred -1.1 +DGX A100 d g x a one hundred -1.1 +DGX Station d g x station -1.1 +DHCP d h c p +DLSS d l s s +DLSS 2 d l s s two +DLSS 3 d l s s three +DNA d n a +DNN d n n +DNS d n s +DOS dos +DP d p +DPDT d p d t +DPU d p u +DPX d p x +DeepMind deep mind +DeepMind deepmind +DeepStream deep stream +DeepStream deepstream +DevOps dev ops +Direct3D direct three d +DisplayPort display port +EBC e b c +EBS e b s +EC2 e c two +ECC e c c +ECS e c s +EFS e f s +EGX e g x +EKS e k s +ELB e l b +ELCB e l c b +EMR e m r +EOL e o l +EV e v +Earth-2 earth two +FIFO fee foe +FIFO fifo +FOMO foe moe +FOMO fomo +FPS f p s +FSA f s a +FST f s t +FTP f t p +FTPS f t p s +FX f x +G-SYNC g sync +GCC g c c +GCP g c p +GCS g c s +GFN G f n +GPU g p u +GRPC g r p c +GRU g r u +GSK g s k +GTC g t c +GTX g t x -1.1 +GUI g u i +GUI gui +GeForce geforce +GeForce NOW geforce now +H.264 H two six four +H.264 H two sixty four +H.265 H two six five +H.265 H two sixty five +H100 H one hundred +HDMI h d m i +HEVC h e v c +HGX h g x +HMD Hmd +HPC h p c +HQ h q +HSM h s m +HT h t +HTML h t m l +HTTP h t t p +HTTPS h t t p s +HuggingFace hugging face +I/O i o +ICMP i c m p +IEC i e c +IEEE i triple e +IGMP i g m p +IGX i g x +IISc i i s c +IIT i i t +INT8 int eight +IP i p +IPSec i p sec +ISC i s c +ISI i s i +InfiniBand inifiband +IoT i o t +Isaac SIM isaac sim +JSON j 
son +JavaScript java script +JavaScript javascript +KVM k v m +LFP lee fei po +LFP life po +LGA l g a -1.1 +LLC l l c +LLM l l m +LLMs l l ms +LLP l l p +LSTM l s t m +LT l t +LTT l t t +LinkedIn linked in +LinkedIn linkedin +MB m b +MCB m c b +MCCB m c c b +MCP m c p +MDM m d m +MFA m f a +MIT m i t +ML m l +MLPerf m l perf +MONAI monai +MWC m w c +MXNet m x net +MacBook mac book +MacBook macbook +MacBook Pro mac book pro +Max-Q max q +Million-X million x +MoCap mo cap +NC n c +NGC n g c +NLP n l p +NLU n l u +NMC n m c +NO n o +NTP n t p +NVDec n v dec +NVDec n v deck +NVEnc n v enc +NVEnc n v inc +NVEnc n v ink +NVMe n v m e +Ni-CD knee cad +NiMH n i m h +NoSQL no s q l +NoSQL no sql +Nvidia A100 nvidia a hundred +Nvidia A100 nvidia a one hundred +Nvidia A30 nvidia a thirty +Nvidia A40 nvidia a forty +OCI o c i +OEM o e m +OEMs o e ms +OLTC o l t c +OLTP o l t p +ONNX o n n x +OVX o v x +OpEx op ex +OpEx opex +OpenCL open c l +OpenGL open g l +PCI p c i +PCIe p c i e -1.1 +PDF p d f +PHEV p h e v +POP3 pop three +PSO p s o +Photoshop photo shop +PhysX fizz ex +PhysX fizz x +PhysX phys ex +PhysX phys x +PlayStation play station +Premiere Pro premiere pro +ProVis pro vis +ProVis provis +PyG pi g +PyG py g +PyTorch pi torch +QODA q o d a +QOS q o s +RCBO r c b o +RCCB r c c b +RDS r d s +RFP r f p +RISC risc +RL r l +RNN r n n +ROI r o i +RSS r s s +RT r t +RTP r t p +RTX r t x -1.1 +RTX ON r t x on +RTX Off r t x off +RacerX racer x +Ryzen 2 rye zen two +Ryzen 3 rye zen three +Ryzen 5 rye zen five +Ryzen 7 rye zen seven +Ryzen 9 rye zen nine +S&P s and p +S3 s three +SAE s a e +SDK s d k +SEO s e o +SFU s f u +SHIELD TV shield tv +SIGGRAPH sig graph +SIGGRAPH siggraph +SMPS s m p s +SMTP s m t p +SNMP s n m p +SNS s n s +SOL s o l +SPDT s p d t +SPN s p n +SPST s p s t +SQL s q l +SQS s q s +SSD s s d +SSH s s h +SSL s s l +SSO s s o +SSR s s r +SUV s u v +SWG s w g +SaaS saas +ServiceNow service now +SuperCloud super cloud +SuperPOD super pod +TCP t c p +TCP/IP t 
c p i p +TFTP t f t p +TJ t j +TOS t o s +TPM t p m +TPMS t p m s +TPN t p n +TSL t s l +TSMC t s m c +TTS t t s +Telnet tell net +TensorRT tensor r t +Tick-Tock tick tock +TikTok app tick tock app +TikTok is tick tock is +UDP u d p +UEFI u e f i +UFI u f i +UPS u p s +USB u s b +VAAPI v a a p i +VAAPI v a api +VDI v d i +VDPAU v d p a u +VDPAU v d paw +VDPAU v d pow +VFD v f d +VLAN v lan +VP v p +VPN v p n +Vcc v c c +Vdd v d d +VoIP v o i p +WFSA w f s a +WFST w f s t +Wi-Fi wi fi +Wi-Fi wifi +XFX x f x +XFX x fx +XML x m l +Zen 2 zen two +Zen 3 zen three +Zen 4 zen four +c# c sharp +c++ c plus plus +cuBLAS cue blah +cuBLAS cue blahs +cuBLAS q blahs +cuDNN c u d n n +cuDNN cue d n n +cuDNN q d n n +cuOpt cue opt +cuOpt q opt +cuQuantum cue quantum +cuQuantum q quantum +dr. doctor +e.g. for example +es3 e s three +ext4 e x t four +i3 i three +i5 i five +i7 i seven +i9 i nine +iOS ios +iTPMS i t p m s +mr. mister +mrs. misses +on TikTok on tick tock +sFTP s f t p +st. saint +x1 x one +x2 x two +x8 x eight \ No newline at end of file diff --git a/itn/english/data/whitelist_tech.tsv b/itn/english/data/whitelist_tech.tsv new file mode 100644 index 00000000..a2a14a41 --- /dev/null +++ b/itn/english/data/whitelist_tech.tsv @@ -0,0 +1,450 @@ +10K ten k +1D one d +1G one g +1K one k +2.5G two point five g +2D two d +2G two g +2K two k +3D three d +3G three g +4D four d +4G four g +4K four k +5D five d +5G five g +6G six g +7-eleven seven eleven +7G seven g +8K eight k +AAA triple a +AC a c +ACL a c l +ADAS a das +AEB a e b +AES a e s +AGX a g x +AI a i +AI ai +AI ais +ALU a l u +AMD a m d +API a p i +API api +ARP a r p +ASAP a s a p +ASAP a sap +ASMC a s m c +ASR a s r +ASUS a sus +ASUS asus +AT&T a t and t +ATM a t m +AV1 a v one +AV1 av one +AVAS a v a s +AVAS a vas +AVRCP a v r c p +AWS a w s +AWS aws +AlphaFold alpha fold +AstraZeneca astra zeneca +AstraZeneca astrazeneca +Audio2Face audio to face +Audio2Face audio too face +BBC b b c +BCP b c p +BEV b e v +BGA b g a 
+BGP b g p +BIOS Bios +BIOS bios +BIS b i s +BSP b s p +BasePOD base pod +BasePOD basepod +Billion-X billion X +C# c sharp +C++ c plus plus +CAT5 cat five +CAT5e cat five e +CAT6 cat six +CAT6A cat six a +CAT7 cat seven +CAT8 cat eight +CBC c b c +CCS c c s +CCTV c c t v +CCTV c c tv +CDN c d n +CEO c e o +CES c e s +CFO c f o +CHAdeMO cha demo +CHAdeMO chai demo +CLI c l i +CMO c m o +CMS c m s +CNN c n n +COVID covid +COVID-19 covid nineteen +CPU c p u +CSO c s o +CSP c s p +CSS c s s +CV c v +CVT c v t +CalTech cal tech +CalTech caltech +CapEx cap ex +CapEx capex +ChatGPT chat g p t +CloudXR cloud x r +ConnectX connect x +Core 2 core two +DALI d a l i +DB d b +DC d c +DCDB d c d b +DDR d d r +DDoS d d o s +DDoS d dos +DGL d g l +DGX d g x -1.1 +DGX A100 d g x a hundred -1.1 +DGX A100 d g x a one hundred -1.1 +DGX Station d g x station -1.1 +DHCP d h c p +DLSS d l s s +DLSS 2 d l s s two +DLSS 3 d l s s three +DNA d n a +DNN d n n +DNS d n s +DOS dos +DP d p +DPDT d p d t +DPU d p u +DPX d p x +DeepMind deep mind +DeepMind deepmind +DeepStream deep stream +DeepStream deepstream +DevOps dev ops +Direct3D direct three d +DisplayPort display port +EBC e b c +EBS e b s +EC2 e c two +ECC e c c +ECS e c s +EFS e f s +EGX e g x +EKS e k s +ELB e l b +ELCB e l c b +EMR e m r +EOL e o l +EV e v +Earth-2 earth two +FAT12 fat twelve +FAT16 fat sixteen +FAT32 fat thirty two +FIFO fee foe +FIFO fifo +FOMO foe moe +FOMO fomo +FPS f p s +FSA f s a +FST f s t +FTP f t p +FTPS f t p s +FX f x +G-SYNC g sync +GCC g c c +GCP g c p +GCS g c s +GFN G f n +GPL G P L +GPLv2 G P L two +GPLv3 G P L three +GPU g p u +GRPC g r p c +GRU g r u +GSK g s k +GTC g t c +GTX g t x -1.1 +GUI g u i +GUI gui +GeForce geforce +GeForce NOW geforce now +H.264 H two six four +H.264 H two sixty four +H.265 H two six five +H.265 H two sixty five +H100 H one hundred +HDMI h d m i +HEVC h e v c +HFS h f s +HFS+ h f s plus +HGX h g x +HMD Hmd +HPC h p c +HQ h q +HSM h s m +HT h t +HTML h t m l +HTTP h t t p 
+HTTPS h t t p s +HuggingFace hugging face +I/O i o +ICMP i c m p +IEC i e c +IEEE i triple e +IGMP i g m p +IGX i g x +IISc i i s c +IIT i i t +INT8 int eight +IP i p +IPSec i p sec +ISC i s c +ISI i s i +InfiniBand inifiband +IoT i o t +Isaac SIM isaac sim +JSON j son +JavaScript java script +JavaScript javascript +KVM k v m +LFP lee fei po +LFP life po +LGA l g a -1.1 +LLC l l c +LLM l l m +LLMs l l ms +LLP l l p +LSTM l s t m +LT l t +LTT l t t +LinkedIn linked in +LinkedIn linkedin +MB m b +MCB m c b +MCCB m c c b +MCP m c p +MDM m d m +MFA m f a +MIT m i t +ML m l +MLPerf m l perf +MONAI monai +MWC m w c +MXNet m x net +MacBook mac book +MacBook macbook +MacBook Pro mac book pro +Max-Q max q +Million-X million x +MoCap mo cap +NC n c +NGC n g c +NLP n l p +NLU n l u +NMC n m c +NO n o +NTFS n t f s +NTP n t p +NVDec n v dec +NVDec n v deck +NVEnc n v enc +NVEnc n v inc +NVEnc n v ink +NVMe n v m e +Ni-CD knee cad +NiMH n i m h +NoSQL no s q l +NoSQL no sql +Nvidia A100 Nvidia a hundred +Nvidia A100 Nvidia a one hundred +Nvidia A100 nvidia a hundred +Nvidia A100 nvidia a one hundred +Nvidia A30 Nvidia a thirty +Nvidia A30 nvidia a thirty +Nvidia A40 Nvidia a forty +Nvidia A40 nvidia a forty +OCI o c i +OEM o e m +OEMs o e ms +OLTC o l t c +OLTP o l t p +ONNX o n n x +OVX o v x +OpEx op ex +OpEx opex +OpenCL open c l +OpenGL open g l +PCI p c i +PCIe p c i e -1.1 +PDF p d f +PHEV p h e v +POP3 pop three +PSO p s o +Photoshop photo shop +PhysX fizz ex +PhysX fizz x +PhysX phys ex +PhysX phys x +PlayStation play station +Premiere Pro premiere pro +ProVis pro vis +ProVis provis +PyG pi g +PyG py g +PyTorch pi torch +QODA q o d a +QOS q o s +RCBO r c b o +RCCB r c c b +RDS r d s +RFP r f p +RISC risc +RL r l +RNN r n n +ROI r o i +RSS r s s +RT r t +RTP r t p +RTX r t x -1.1 +RTX ON r t x on +RTX Off r t x off +RacerX racer x +Ryzen 2 rye zen two +Ryzen 3 rye zen three +Ryzen 5 rye zen five +Ryzen 7 rye zen seven +Ryzen 9 rye zen nine +S&P s and p +S3 s three +SAE 
s a e +SDK s d k +SEO s e o +SFU s f u +SHIELD TV shield tv +SIGGRAPH sig graph +SIGGRAPH siggraph +SMPS s m p s +SMTP s m t p +SNMP s n m p +SNS s n s +SOL s o l +SPDT s p d t +SPN s p n +SPST s p s t +SQL s q l +SQS s q s +SSD s s d +SSH s s h +SSL s s l +SSO s s o +SSR s s r +SUV s u v +SWG s w g +SaaS saas +ServiceNow service now +SuperCloud super cloud +SuperPOD super pod +TCP t c p +TCP/IP t c p i p +TFTP t f t p +TJ t j +TOS t o s +TPM t p m +TPMS t p m s +TPN t p n +TSL t s l +TSMC t s m c +TTS t t s +Telnet tell net +TensorRT tensor r t +The A100 The a hundred +The A100 The a one hundred +The A100 the a hundred +The A100 the a one hundred +The A30 The a thirty +The A30 the a thirty +The A40 The a forty +The A40 the a forty +Tick-Tock tick tock +TikTok app tick tock app +TikTok is tick tock is +UDP u d p +UEFI u e f i +UFI u f i +UFS u f s +UPS u p s +USB u s b +VAAPI v a a p i +VAAPI v a api +VDI v d i +VDPAU v d p a u +VDPAU v d paw +VDPAU v d pow +VFD v f d +VLAN v lan +VMFS v m f s +VP v p +VPN v p n +Vcc v c c +Vdd v d d +VoIP v o i p +WFSA w f s a +WFST w f s t +Wi-Fi wi fi +Wi-Fi wifi +XFX x f x +XFX x fx +XML x m l +ZFS z f s +Zen 2 zen two +Zen 3 zen three +Zen 4 zen four +b-tree b t ree +btrfs b t r f s +btrfs b tree f s +btrfs butter f s +c# c sharp +c++ c plus plus +cuBLAS cue blah +cuBLAS cue blahs +cuBLAS q blahs +cuDNN c u d n n +cuDNN cue d n n +cuDNN q d n n +cuOpt cue opt +cuOpt q opt +cuQuantum cue quantum +cuQuantum q quantum +dr. doctor +e.g. for example +es3 e s three +exFAT ex fat +ext3 e x t three +ext4 e x t four +i3 i three +i5 i five +i7 i seven +i9 i nine +iOS ios +iTPMS i t p m s +mr. mister +mrs. misses +on TikTok on tick tock +sFTP s f t p +st. 
saint +x1 x one +x16 x sixteen +x2 x two +x4 x four +x8 x eight +xfs x f s diff --git a/itn/english/data/year_suffix.tsv b/itn/english/data/year_suffix.tsv new file mode 100644 index 00000000..65a05a18 --- /dev/null +++ b/itn/english/data/year_suffix.tsv @@ -0,0 +1,6 @@ +AD a d +BC b c +BCE b c e +BCE before common era +CE c e +CE common era diff --git a/itn/english/test/data/cardinal.txt b/itn/english/test/data/cardinal.txt new file mode 100644 index 00000000..1155712c --- /dev/null +++ b/itn/english/test/data/cardinal.txt @@ -0,0 +1,60 @@ +nine trillion seven hundred eighty nine billion three hundred eighty two million five hundred thirty six thousand one hundred thirty~9789382536130 +two hundred and fifty four~254 +one hundred forty seven thousand four hundred fifty one~147451 +one million one hundred fifty six thousand one hundred seventy three~1156173 +one billion five hundred ninety three million seventy two thousand nine hundred sixty one~1593072961 +One Billion five hundred ninety three Million seventy two thousand nine hundred sixty one~1593072961 +ninety seven quadrillion eight hundred eight trillion two hundred sixty four billion seven hundred seventy two million seven hundred ninety two thousand five~97808264772792005 +seventeen sextillion eight hundred fifty five quintillion thirty six quadrillion six hundred fifty seven trillion seven billion five hundred ninety six million one hundred ten thousand nine hundred forty nine~17855036657007596110949 +ten quadrillion ten trillion ten million one hundred thousand ten~10010000010100010 +minus twenty five thousand thirty seven~-25037 +one quadrillion two hundred sixty four trillion three hundred one billion nine hundred thirty eight million one hundred four~1264301938000104 +minus sixty~-60 +forty six thousand six hundred sixty four~46664 +sixty~60 +zero~zero +two million three~2000003 +one thousand thirteen~1013 +one thousand one~1001 +one thousand one hundred~1100 +one thousand twenty six~1026 +one thousand 
one hundred twenty six~1126 +eighteen million four hundred fifty thousand nine hundred ninety~18450990 +eighteen million nine hundred forty thousand seven hundred twenty two~18940722 +eighteen million six hundred ninety thousand nine hundred sixteen~18690916 +eighteen thousand eight hundred eighty~18880 +eleven hundred~1100 +twenty one hundred~2100 +twenty one hundred and eleven~2111 +eleven hundred twenty one~1121 +Nine trillion seven hundred eighty nine billion three hundred eighty two million five hundred thirty six thousand one hundred thirty~9789382536130 +Two hundred and fifty four~254 +One hundred forty seven thousand four hundred fifty one~147451 +One million one hundred fifty six thousand one hundred seventy three~1156173 +One billion five hundred ninety three million seventy two thousand nine hundred sixty one~1593072961 +Ninety seven quadrillion eight hundred eight trillion two hundred sixty four billion seven hundred seventy two million seven hundred ninety two thousand five~97808264772792005 +Seventeen sextillion eight hundred fifty five quintillion thirty six quadrillion six hundred fifty seven trillion seven billion five hundred ninety six million one hundred ten thousand nine hundred forty nine~17855036657007596110949 +Ten quadrillion ten trillion ten million one hundred thousand ten~10010000010100010 +Minus twenty five thousand thirty seven~-25037 +One quadrillion two hundred sixty four trillion three hundred one billion nine hundred thirty eight million one hundred four~1264301938000104 +Minus sixty~-60 +Forty six thousand six hundred sixty four~46664 +Sixty~60 +Zero~Zero +Two million three~2000003 +One thousand thirteen~1013 +One thousand one~1001 +One thousand one hundred~1100 +One thousand twenty six~1026 +One thousand one hundred twenty six~1126 +Eighteen million four hundred fifty thousand nine hundred ninety~18450990 +Eighteen million nine hundred forty thousand seven hundred twenty two~18940722 +Eighteen million six hundred ninety thousand 
nine hundred sixteen~18690916 +Eighteen thousand eight hundred eighty~18880 +Eleven hundred~1100 +Twenty one hundred~2100 +Twenty one hundred and eleven~2111 +Eleven hundred twenty one~1121 +Nineteen~19 +Twelve~Twelve +twenty one crore ninety eight lakh thirty six thousand five hundred and ninety three~219836593 diff --git a/itn/english/test/data/date.txt b/itn/english/test/data/date.txt new file mode 100644 index 00000000..d121cc0a --- /dev/null +++ b/itn/english/test/data/date.txt @@ -0,0 +1,70 @@ +july twenty fifth two thousand twelve~july 25 2012 +nineteen eighties~1980s +two thousand and twenty~2020 +two thousand and nine~2009 +the twenty fifth of july twenty twelve~25 july 2012 +the twenty fifth of july two thousand twelve~25 july 2012 +the twenty second of july twenty twelve~22 july 2012 +the fifteenth of january~15 january +the seventeenth of may twenty ten~17 may 2010 +january first~january 1 +july twenty second two thousand eight~july 22 2008 +june thirty~june 30 +july twenty fifth twenty twelve~july 25 2012 +nineteen seventeen~1917 +twenty twelve~2012 +march sixteen sixty five~march 1665 +sixteen sixty five~1665 +july two thousand twelve~july 2012 +october nineteen oh five~october 1905 +july fifteen o six~july 1506 +the twenty fifth of july twenty twelve~25 july 2012 +july twenty fifth twenty twelve~july 25 2012 +july twenty fifth two thousand twelve~july 25 2012 +july one thousand eight hundred seventy six~july 1876 +february twenty fifth twenty sixteen~february 25 2016 +november twenty fourth twenty fourteen~november 24 2014 +nineteen ninety four~1994 +two thousand three~2003 +one thousand eight~1008 +nineteen seventy six~1976 +june twentieth twenty fourteen~june 20 2014 +nineteen seventy three~1973 +nineteen seventy five~1975 +eleven fifty five~1155 +July twenty fifth two thousand twelve~July 25 2012 +Nineteen eighties~1980s +Two thousand and twenty~2020 +Two thousand and nine~2009 +The twenty fifth of july twenty twelve~25 july 2012 +The twenty fifth 
of july two thousand twelve~25 july 2012 +The twenty second of july twenty twelve~22 july 2012 +The fifteenth of january~15 january +The fifteenth of January~15 January +The seventeenth of may twenty ten~17 may 2010 +January first~January 1 +July twenty second two thousand eight~July 22 2008 +June thirty~June 30 +July twenty fifth twenty twelve~July 25 2012 +Nineteen seventeen~1917 +Twenty twelve~2012 +March sixteen sixty five~March 1665 +Sixteen sixty five~1665 +July two thousand twelve~July 2012 +October nineteen oh five~October 1905 +July fifteen o six~July 1506 +The twenty fifth of july twenty twelve~25 july 2012 +The twenty fifth of July twenty twelve~25 July 2012 +July twenty fifth twenty twelve~July 25 2012 +July twenty fifth two thousand twelve~July 25 2012 +July one thousand eight hundred seventy six~July 1876 +February twenty fifth twenty sixteen~February 25 2016 +November twenty fourth twenty fourteen~November 24 2014 +Nineteen ninety four~1994 +Two thousand three~2003 +One thousand eight~1008 +Nineteen seventy six~1976 +June twentieth twenty fourteen~June 20 2014 +Nineteen seventy three~1973 +Nineteen seventy five~1975 +Eleven fifty five~1155 diff --git a/itn/english/test/data/decimal.txt b/itn/english/test/data/decimal.txt new file mode 100644 index 00000000..b9cc1ac7 --- /dev/null +++ b/itn/english/test/data/decimal.txt @@ -0,0 +1,129 @@ +five point two million~5.2 million +one hundred sixty four point five eight thousand~164.58 thousand +four hundred million~400 million +fifty billion~50 billion +four hundred five billion~405 billion +four point eight five billion~4.85 billion +one hundred billion~100 billion +one hundred ten billion~110 billion +one hundred thirty two billion~132 billion +one point eight four billion~1.84 billion +one point eight one billion~1.81 billion +one point five nine billion~1.59 billion +one point four five three billion~1.453 billion +one point seven two billion~1.72 billion +one point two five billion~1.25 billion 
+thirteen billion~13 billion +thirty billion~30 billion +two thousand eight hundred five point eight seven three billion~2805.873 billion +seventy trillion~70 trillion +thirteen million~13 million +eighteen billion~18 billion +four hundred fifty million~450 million +one hundred thirty million~130 million +ten million~10 million +four hundred million~400 million +five million~5 million +five hundred million~500 million +twelve million~12 million +thirteen million~13 million +four million~4 million +forty five million~45 million +fifteen million~15 million +fifteen trillion~15 trillion +fifteen billion~15 billion +two million~2 million +eight million~8 million +point one two o five~.1205 +minus sixty point two four zero zero~-60.2400 +zero point two six~0.26 +point zero two~.02 +sixty point two~60.2 +eighteen~18 +eighteen point eight five~18.85 +eighteen point five o~18.50 +eighteen point five six~18.56 +eighteen point nine~18.9 +eighteen point o five~18.05 +eighteen point one two~18.12 +eighteen point o one~18.01 +eighteen point o o o~18.000 +eighteen point six~18.6 +eighteen point three o o~18.300 +eighteen point three six~18.36 +eighteen point two five~18.25 +eighteen point two two~18.22 +eight hundred eighteen point three o three~818.303 +eight hundred eight point eight~808.8 +eight hundred eight point zero~808.0 +eight hundred eighty eight point one~888.1 +eight hundred eighty four point three~884.3 +eight hundred eighty two point eight~882.8 +eight hundred eighty two point zero~882.0 +eight hundred forty five point nine four~845.94 +Five point two million~5.2 million +One hundred sixty four point five eight thousand~164.58 thousand +Four hundred million~400 million +Fifty billion~50 billion +Four hundred five billion~405 billion +Four point eight five billion~4.85 billion +One hundred billion~100 billion +One hundred ten billion~110 billion +One hundred thirty two billion~132 billion +One point eight four billion~1.84 billion +One point eight four Billion~1.84 
Billion +One point eight one billion~1.81 billion +One point five nine billion~1.59 billion +One point four five three billion~1.453 billion +One point seven two billion~1.72 billion +One point two five billion~1.25 billion +Thirteen billion~13 billion +Thirty billion~30 billion +Thirty Billion~30 Billion +Two thousand eight hundred five point eight seven three billion~2805.873 billion +Seventy trillion~70 trillion +Thirteen million~13 million +Eighteen billion~18 billion +Four hundred fifty million~450 million +One hundred thirty million~130 million +Ten million~10 million +Four hundred million~400 million +Five million~5 million +Five hundred million~500 million +Twelve million~12 million +Thirteen million~13 million +Four million~4 million +Forty five million~45 million +Fifteen million~15 million +Fifteen trillion~15 trillion +Fifteen billion~15 billion +Fifteen Billion~15 Billion +Two million~2 million +Eight million~8 million +Point one two o five~.1205 +Minus sixty point two four zero zero~-60.2400 +Zero point two six~0.26 +Point zero two~.02 +Sixty point two~60.2 +Eighteen~18 +Eighteen point eight five~18.85 +Eighteen point five o~18.50 +Eighteen point five six~18.56 +Eighteen point nine~18.9 +Eighteen point o five~18.05 +Eighteen point one two~18.12 +Eighteen point o one~18.01 +Eighteen point o o o~18.000 +Eighteen point six~18.6 +Eighteen point three o o~18.300 +Eighteen point three six~18.36 +Eighteen point two five~18.25 +Eighteen point two two~18.22 +Eight hundred eighteen point three o three~818.303 +Eight hundred eight point eight~808.8 +Eight hundred eight point zero~808.0 +Eight hundred eighty eight point one~888.1 +Eight hundred eighty four point three~884.3 +Eight hundred eighty two point eight~882.8 +Eight hundred eighty two point zero~882.0 +Eight hundred forty five point nine four~845.94 diff --git a/itn/english/test/data/electronic.txt b/itn/english/test/data/electronic.txt new file mode 100644 index 00000000..07df3f04 --- /dev/null +++ 
b/itn/english/test/data/electronic.txt @@ -0,0 +1,59 @@ +a dot b c at g mail dot com~a.bc@gmail.com +a at gmail dot com~a@gmail.com +a at m s n dot fr~a@msn.fr +a at a o l dot com~a@aol.com +a at m s n dot com~a@msn.com +a at nvidia dot com~a@nvidia.com +a dot b c at nvidia dot com~a.bc@nvidia.com +c d f at a b c dot e d u~cdf@abc.edu +a b c at g mail dot a b c~abc@gmail.abc +a b c at a b c dot com~abc@abc.com +a s d f one two three at a b c dot com~asdf123@abc.com +a one b two at a b c dot com~a1b2@abc.com +a b three dot s d d dot three at g mail dot com~ab3.sdd.3@gmail.com +dot three at g mail dot com~dot 3@gmail.com +one three at g mail dot com~13@gmail.com +a b three hyphen s d d dash three at g mail dot com~ab3-sdd-3@gmail.com +h t t p colon slash slash w w w dot o u r d a i l y n e w s dot com dot s m~http://www.ourdailynews.com.sm +h t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b dot s m~http://www.comdailynews.ab.sm +h t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b slash s m~http://www.comdailynews.ab/sm +w w w dot c o m d a i l y n e w s dot a b slash s m~www.comdailynews.ab/sm +c o m d a i l y n e w s dot a b slash s m~comdailynews.ab/sm +n vidia dot com~nvidia.com +abc at gmail dot com~abc@gmail.com +athreed at gmail dot com~athreed@gmail.com +kore dot ai~kore.ai +A dot b c at g mail dot com~A.bc@gmail.com +A at gmail dot com~A@gmail.com +A at m s n dot fr~A@msn.fr +A at a o l dot com~A@aol.com +A at m s n dot com~A@msn.com +A at nvidia dot com~A@nvidia.com +A dot b c at nvidia dot com~A.bc@nvidia.com +C d f at a b c dot e d u~Cdf@abc.edu +A b c at g mail dot a b c~Abc@gmail.abc +A b c at a b c dot com~Abc@abc.com +A s d f one two three at a b c dot com~Asdf123@abc.com +A one b two at a b c dot com~A1b2@abc.com +A b three dot s d d dot three at g mail dot com~Ab3.sdd.3@gmail.com +Dot three at g mail dot com~Dot 3@gmail.com +One three at g mail dot com~13@gmail.com +A b three hyphen s d d dash three at g mail dot 
com~Ab3-sdd-3@gmail.com +H t t p colon slash slash w w w dot o u r d a i l y n e w s dot com dot s m~http://www.ourdailynews.com.sm +HTTP colon slash slash w w w dot o u r d a i l y n e w s dot com dot s m~http://www.ourdailynews.com.sm +HTTP colon slash slash w w w dot o u r d a i l y n e w s dot com dot Zero one~http://www.ourdailynews.com.01 +http colon slash slash w w w dot o u r d a i l y n e w s dot com dot SM and s ~http://www.ourdailynews.com.SM and s +Http colon slash slash w w w dot o u r d a i l y n e w s dot COM dot s m~http://www.ourdailynews.COM.sm +H t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b dot s m~http://www.comdailynews.ab.sm +H t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b slash s m~http://www.comdailynews.ab/sm +W w w dot c o m d a i l y n e w s dot a b slash s m~www.comdailynews.ab/sm +C o m d a i l y n e w s dot a b slash s m~Comdailynews.ab/sm +N vidia dot com~nvidia.com +Abc at gmail dot com~Abc@gmail.com +Athreed at gmail dot com~Athreed@gmail.com +Kore dot ai~Kore.ai +NVIDIA dot com~NVIDIA.com +NVIDIA dot COM~NVIDIA.COM +WWW.A B C at A B C dot com~WWW.A BC@ABC.com +W W W. A B C dot com~www.ABC.com +w w w . o u r d a i l y n e w s dot com . s m~www.ourdailynews.com . 
s m diff --git a/itn/english/test/data/measure.txt b/itn/english/test/data/measure.txt new file mode 100644 index 00000000..1cf435d0 --- /dev/null +++ b/itn/english/test/data/measure.txt @@ -0,0 +1,232 @@ +two hundred meters~200 m +fifty six point three per square kilometer~56.3 /km² +two hundred kilometers per hour~200 km/h +twenty eight kilograms force per square centimeter~28 kgf/cm² +forty two thousand two hundred fifty nine per square meter~42259 /m² +minus two thousand twelve kilo liters~-2012 kl +minus sixty six kilograms~-66 kg +two kilo watt hours~2 kWh +one point o o o o two eight cubic deci meters~1.000028 dm³ +seven point five peta bytes~7.5 pb +three hours~3 h +one milli volt~1 mv +two cubic meters~2 m³ +ninety grams~90 g +one hundred twenty four point three lumens~124.3 lm +four hundred forty milliliters~440 ml +thirty one thousand four hundred eighty square feet~31480 sq ft +one thousand six hundred hours~1600 h +thirty one thousand four hundred eighty square feet~31480 sq ft +two square miles~2 sq mi +zero point one nine square miles~0.19 sq mi +one thousand five hundred thirty one c c~1531 cc +three hundred micrometers~300 μm +sixty five thousand square kilometers~65000 km² +two miles per hour~2 mph +two hundred forty five miles per hour~245 mph +one hundred fifty c c~150 cc +sixty point two four zero zero kilograms~60.2400 kg +zero feet~0 ft +zero foot~0 ft +two feet~2 ft +twenty foot~20 ft +point two meters~.2 m +two square meters~2 m² +eighteen feet~18 ft +eighteen mega siemens~18 ms +eighteen ounces~18 oz +eighteen point five kilometers~18.5 km +eighteen point five two square kilometers~18.52 km² +eighteen point nine one square kilometers~18.91 km² +eighteen point one four percent~18.14 % +eighteen point one six percent~18.16 % +eighteen point one square kilometers~18.1 km² +eighteen point six percent~18.6 % +eighteen point two two kilometers~18.22 km +eighteen point zero kilometers~18.0 km +eighteen point zero percent~18.0 % +eighteen square 
kilometers~18 km² +eighteen thousand eight hundred giga watt hours~18800 gWh +eighteen thousand seven hundred hectares~18700 ha +eight hectares~8 ha +eight hundred eighty five astronomical units~885 au +eight hundred eighty hectares~880 ha +eight hundred eighty kilobytes~880 kb +eight hundred eighty kilometers~880 km +eight hundred eighty nine feet~889 ft +eight hundred eighty six kilometers~886 km +eight hundred eighty two megawatts~882 mW +eight hundred feet~800 ft +eight hundred fifty five square kilometers~855 km² +eight hundred fifty megahertz~850 mhz +eight hundred fifty meters~850 m +eight hundred fifty nanometers~850 nm +eight hundred fifty one meters~851 m +eight hundred fifty seven square kilometers~857 km² +eight hundred fifty three meters~853 m +eight hundred fifty three point six meters~853.6 m +eight hundred five point four six square kilometers~805.46 km² +eight hundred forty two point nine meters~842.9 m +eight hundred forty two square kilometers~842 km² +eight hundred gigabytes~800 gb +eight hundred horsepower~800 hp +eight hundred kilograms~800 kg +eight hundred kilo watt hours~800 kWh +eight hundred kilowatts~800 kW +eight hundred megahertz~800 mhz +eight hundred ninety four c c~894 cc +eight hundred ninety kilowatts~890 kW +eight hundred ninety millimeters~890 mm +eight hundred ninety two square kilometers~892 km² +eight hundred seventy horsepower~870 hp +eight hundred seventy meters~870 m +eight hundred sixty kilograms~860 kg +eight hundred sixty kilometers~860 km +eight hundred sixty miles~860 mi +eight hundred sixty six feet~866 ft +eight hundred ten hectares~810 ha +eight hundred ten kilohertz~810 khz +eight hundred thirty eight point two millimeters~838.2 mm +eight hundred thirty five kilometers~835 km +eight hundred thirty kilohertz~830 khz +eight hundred thirty megawatts~830 mW +eight hundred thirty nine kilometers~839 km +eight hundred thirty six meters~836 m +eight hundred twenty feet~820 ft +eight hundred twenty kilometers~820 km 
+eight hundred twenty meters~820 m +eight hundred twenty one point zero feet~821.0 ft +eight hundred two point eight nine kilometers~802.89 km +eight hundred volts~800 v +eight kilobits~8 kb +eight kilograms~8 kg +eight million two hundred thousand feet~8200000 ft +eight point eight kilometers~8.8 km +eight point eight meters~8.8 m +eight point eight miles~8.8 mi +eight point five centimeters~8.5 cm +eight point five five percent~8.55 % +eight point five megawatts~8.5 mW +eight point five meters~8.5 m +eight point five two percent~8.52 % +eight point four four percent~8.44 % +Two hundred meters~200 m +Fifty six point three per square kilometer~56.3 /km² +Two hundred kilometers per hour~200 km/h +Twenty eight kilograms force per square centimeter~28 kgf/cm² +Forty two thousand two hundred fifty nine per square meter~42259 /m² +Minus two thousand twelve kilo liters~-2012 kl +Minus sixty six kilograms~-66 kg +Two kilo watt hours~2 kWh +One point o o o o two eight cubic deci meters~1.000028 dm³ +Seven point five peta bytes~7.5 pb +Three hours~3 h +One milli volt~1 mv +Two cubic meters~2 m³ +Ninety grams~90 g +One hundred twenty four point three lumens~124.3 lm +Four hundred forty milliliters~440 ml +Thirty one thousand four hundred eighty square feet~31480 sq ft +One thousand six hundred hours~1600 h +Thirty one thousand four hundred eighty square feet~31480 sq ft +Two square miles~2 sq mi +Zero point one nine square miles~0.19 sq mi +One thousand five hundred thirty one c c~1531 cc +Three hundred micrometers~300 μm +Sixty five thousand square kilometers~65000 km² +Two miles per hour~2 mph +Two hundred forty five miles per hour~245 mph +One hundred fifty c c~150 cc +Sixty point two four zero zero kilograms~60.2400 kg +Zero feet~0 ft +Zero foot~0 ft +Two feet~2 ft +Twenty foot~20 ft +Point two meters~.2 m +Two square meters~2 m² +Eighteen feet~18 ft +Eighteen mega siemens~18 ms +Eighteen ounces~18 oz +Eighteen point five kilometers~18.5 km +Eighteen point five two 
square kilometers~18.52 km² +Eighteen point nine one square kilometers~18.91 km² +Eighteen point one four percent~18.14 % +Eighteen point one six percent~18.16 % +Eighteen point one square kilometers~18.1 km² +Eighteen point six percent~18.6 % +Eighteen point two two kilometers~18.22 km +Eighteen point zero kilometers~18.0 km +Eighteen point zero percent~18.0 % +Eighteen square kilometers~18 km² +Eighteen thousand eight hundred giga watt hours~18800 gWh +Eighteen thousand seven hundred hectares~18700 ha +Eight hectares~8 ha +Eight hundred eighty five astronomical units~885 au +Eight hundred eighty hectares~880 ha +Eight hundred eighty kilobytes~880 kb +Eight hundred eighty kilometers~880 km +Eight hundred eighty nine feet~889 ft +Eight hundred eighty six kilometers~886 km +Eight hundred eighty two megawatts~882 mW +Eight hundred feet~800 ft +Eight hundred fifty five square kilometers~855 km² +Eight hundred fifty megahertz~850 mhz +Eight hundred fifty meters~850 m +Eight hundred fifty nanometers~850 nm +Eight hundred fifty one meters~851 m +Eight hundred fifty seven square kilometers~857 km² +Eight hundred fifty three meters~853 m +Eight hundred fifty three point six meters~853.6 m +Eight hundred five point four six square kilometers~805.46 km² +Eight hundred forty two point nine meters~842.9 m +Eight hundred forty two square kilometers~842 km² +Eight hundred gigabytes~800 gb +Eight hundred horsepower~800 hp +Eight hundred kilograms~800 kg +Eight hundred kilo watt hours~800 kWh +Eight hundred kilowatts~800 kW +Eight hundred megahertz~800 mhz +Eight hundred ninety four c c~894 cc +Eight hundred ninety kilowatts~890 kW +Eight hundred ninety millimeters~890 mm +Eight hundred ninety two square kilometers~892 km² +Eight hundred seventy horsepower~870 hp +Eight hundred seventy meters~870 m +Eight hundred sixty kilograms~860 kg +Eight hundred sixty kilometers~860 km +Eight hundred sixty miles~860 mi +Eight hundred sixty six feet~866 ft +Eight hundred ten hectares~810 ha 
+Eight hundred ten kilohertz~810 khz +Eight hundred thirty eight point two millimeters~838.2 mm +Eight hundred thirty five kilometers~835 km +Eight hundred thirty kilohertz~830 khz +Eight hundred thirty megawatts~830 mW +Eight hundred thirty nine kilometers~839 km +Eight hundred thirty six meters~836 m +Eight hundred twenty feet~820 ft +Eight hundred twenty kilometers~820 km +Eight hundred twenty meters~820 m +Eight hundred twenty one point zero feet~821.0 ft +Eight hundred two point eight nine kilometers~802.89 km +Eight hundred volts~800 v +Eight kilobits~8 kb +Eight KiloBits~8 kb +Eight kilograms~8 kg +Eight million two hundred thousand feet~8200000 ft +Eight point eight kilometers~8.8 km +Eight point eight meters~8.8 m +Eight point eight miles~8.8 mi +Eight point five centimeters~8.5 cm +Eight point five five percent~8.55 % +Eight point five megawatts~8.5 mW +Eight point five meters~8.5 m +Eight point five two percent~8.52 % +Eight point four four percent~8.44 % +one gigabit per second~1 gbps +nine gigabits per second~9 gbps +five degrees celsius~5 °C +seventy two degrees fahrenheit~72 °F +Seventy two Degrees Fahrenheit~72 °F +two hundred seventy three kelvin~273 K +Nine GigaBits per second~9 gbps diff --git a/itn/english/test/data/money.txt b/itn/english/test/data/money.txt new file mode 100644 index 00000000..9a40405a --- /dev/null +++ b/itn/english/test/data/money.txt @@ -0,0 +1,54 @@ +Two dollars~$2 +One cent~$0.01 +Four united states dollars and sixty nine cents~$4.69 +Seventy five dollars sixty three~$75.63 +Twenty nine dollars fifty cents~$29.50 +Eleven dollars and fifty one cents~$11.51 +Nine hundred ninety three dollars and ninety two cents~$993.92 +Four hundred sixty billion won~₩460 billion +Thirty billion yen~¥30 billion +Two point five billion dollars~$2.5 billion +Forty five billion dollars~$45 billion +Fifty million dollars~$50 million +Fifty billion dollars~$50 billion +Zero point two million dollars~$0.2 million +Fifteen point two billion 
dollars~$15.2 billion +One point six nine billion yuan~1.69 billion yuan +One point four three six billion yuan~1.436 billion yuan +Four million yuan~4 million yuan +One dollar~$1 +Fifteen thousand dollars~$15000 +One dollars~One dollars +Twenty dollar~$20 +Twenty point five o six dollars~$20.506 +Point five o six dollars~$.506 +Eighteen dollars~$18 +Eighteen million nine hundred twenty five thousand dollars~$18925000 +Eighteen thousand eight hundred fifty four dollars~$18854 +Eighteen thousand eight hundred one dollars~$18801 +Eighteen thousand eight hundred seventy five dollars~$18875 +Eighteen thousand eighty one dollars~$18081 +Eighteen thousand fifty two dollars~$18052 +Eighteen thousand five hundred forty two dollars~$18542 +Eighteen thousand five hundred nineteen dollars~$18519 +Eighteen thousand five hundred seventy dollars~$18570 +Eighteen thousand five hundred seventy eight dollars~$18578 +Eighteen thousand five hundred sixteen dollars~$18516 +Eighteen thousand four hundred eighty two dollars~$18482 +Eighteen thousand four hundred seventy eight dollars~$18478 +Eighteen thousand four hundred sixty eight dollars~$18468 +Eighteen thousand nine hundred three dollars~$18903 +Eighteen thousand nine hundred twenty nine dollars~$18929 +Eighteen thousand ninety five dollars~$18095 +Eighteen thousand one hundred seventeen dollars~$18117 +Eighteen thousand one hundred twenty eight dollars~$18128 +Eighteen thousand one hundred twenty five dollars~$18125 +Eighteen thousand one hundred twenty four dollars~$18124 +Eighteen thousand one hundred twenty nine dollars~$18129 +One thousand fifty five dollars~$1055 +One fifty five dollars~$155 +Fifteen hundred dollars~$1500 +Ninety nine hundred dollars~$9900 +Ninety nine hundred and fifteen dollars and one cent~$9915.01 +zero point two Million Dollars~$0.2 Million +One Cent~$0.01 diff --git a/itn/english/test/data/ordinal.txt b/itn/english/test/data/ordinal.txt new file mode 100644 index 00000000..28b5af81 --- /dev/null +++ 
b/itn/english/test/data/ordinal.txt @@ -0,0 +1,34 @@ +One hundredth~100th +Twenty five thousand one hundred eleventh~25111th +Second~2nd +Zeroth~0th +First~1st +Second~2nd +Third~3rd +Fourth~4th +Eleventh~11th +Twelfth~12th +Thirteenth~13th +Twenty first~21st +Twenty third~23rd +One hundred eleventh~111th +One thousandth~1000th +One hundred twenty first~121st +Eleven hundred twenty first~1121st +Second~2nd +Tenth~10th +Sixth~6th +Third~3rd +Nineteenth~19th +Third~3rd +Twelfth~12th +Forty eighth~48th +Seventy first~71st +Third~3rd +Forty second~42nd +Seventeenth~17th +Twentieth~20th +Twenty first~21st +Seventh~7th +Second~2nd +Fifth~5th diff --git a/itn/english/test/data/telephone.txt b/itn/english/test/data/telephone.txt new file mode 100644 index 00000000..cfb2e73d --- /dev/null +++ b/itn/english/test/data/telephone.txt @@ -0,0 +1,18 @@ +One two three one two three five six seven eight~123-123-5678 +Plus nine one one two three one two three five six seven eight~+91 123-123-5678 +Plus forty four one two three one two three five six seven eight~+44 123-123-5678 +Four one two three one two three five six seven eight~4 123-123-5678 +Zero two three one two three five six seven eight~023-123-5678 +O two three one two three five six seven eight~023-123-5678 +Oh two three one two three five six seven eight~023-123-5678 +Double oh three one two three five six seven eight~003-123-5678 +Four three two double seven three two one four three two one four three double zero five~432 7732 143214 3005 +One two three dot one two three dot o dot four o~123.123.0.40 +One twenty three dot one two three dot o dot four o~123.123.0.40 +Two two five dot double five dot o dot four o~225.55.0.40 +Two two five dot double five dot o dot forty five~225.55.0.45 +SSN is seven double nine one two three double one three~SSN is 799-12-3113 +Seven nine nine~799 +A b nine~Ab9 +A b c~A b c +Five w k r a three one~5wkra31 diff --git a/itn/english/test/data/time.txt b/itn/english/test/data/time.txt new 
file mode 100644 index 00000000..ac76b36b --- /dev/null +++ b/itn/english/test/data/time.txt @@ -0,0 +1,33 @@ +Eight oclock g m t~08:00 gmt +Seven a m e s t~07:00 a.m. est +Two p m~02:00 p.m. +Seven A M E S T~07:00 A.M. EST +Two P M~02:00 P.M. +Two thirty~02:30 +Three o'clock~03:00 +Quarter past one~01:15 +Half past three~03:30 +Eight fifty one~08:51 +Eight fifty two~08:52 +Eight forty~08:40 +Eight nineteen~08:19 +Eight o six~08:06 +Eight thirty eight~08:38 +Eight thirty two~08:32 +Eight twenty nine~08:29 +Eleven fifty five p m~11:55 p.m. +Eleven fifty three p m~11:53 p.m. +Eleven forty a m~11:40 a.m. +Eleven forty five a m~11:45 a.m. +Eleven forty p m~11:40 p.m. +Eleven forty six a m~11:46 a.m. +Eleven o six p m~11:06 p.m. +Eleven thirteen a m~11:13 a.m. +Half past twelve~12:30 +Quarter past one~01:15 +Quarter to one~12:45 +Quarter to twelve~11:45 +Set alarm at ten to eleven pm~Set alarm at 10:50 p.m. +One min to one am~12:59 a.m. +eleven Forty six A M~11:46 A.M. +eleven forty six AM~11:46 A.M. diff --git a/itn/english/test/data/whitelist.txt b/itn/english/test/data/whitelist.txt new file mode 100644 index 00000000..684c321d --- /dev/null +++ b/itn/english/test/data/whitelist.txt @@ -0,0 +1,9 @@ +Doctor dao~Dr. dao +Misses Smith~Mrs. Smith +Misses smith~Mrs. smith +Mister Dao~Mr. Dao +Saint George~St. George +I like for example ice cream~I like e.g. ice cream +S and p five hundred~S&P 500 +S and P Five hundred~S&P 500 +Seven eleven stores~7-eleven stores diff --git a/itn/english/test/data/word.txt b/itn/english/test/data/word.txt new file mode 100644 index 00000000..3d02d716 --- /dev/null +++ b/itn/english/test/data/word.txt @@ -0,0 +1,49 @@ +E s three~Es3 +Yahoo!~Yahoo! +Twenty!~20 ! 
+X ~X +—~— +Aaa~Aaa +Aabach~Aabach +Aabenraa~Aabenraa +Aabye~Aabye +Aaccessed~Aaccessed +Aach~Aach +Aachen's~Aachen's +Aadri~Aadri +Aafia~Aafia +Aagaard~Aagaard +Aagadu~Aagadu +Aagard~Aagard +Aagathadi~Aagathadi +Aaghart's~Aaghart's +Aagnes~Aagnes +Aagomoni~Aagomoni +Aagon~Aagon +Aagoo~Aagoo +Aagot~Aagot +Aahar~Aahar +Aahh~Aahh +Aahperd~Aahperd +Aaibinterstate~Aaibinterstate +Aajab~Aajab +Aakasa~Aakasa +Aakervik~Aakervik +Aakirkeby~Aakirkeby +Aalam~Aalam +Aalbaek~Aalbaek +Aaldiu~Aaldiu +Aalem~Aalem +A'ali~A'ali +Aalilaassamthey~Aalilaassamthey +Aalin~Aalin +Aaliyan~Aaliyan +Aaliyan's~Aaliyan's +Aamadu~Aamadu +Aamara~Aamara +Aambala~Aambala +Aamera~Aamera +Aamer's~Aamer's +Aamina~Aamina +Aaminah~Aaminah +Aamjiwnaang~Aamjiwnaang From c8483ac5227e0a92ccf6902b64c850b1afba18d5 Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Mon, 1 Dec 2025 13:52:22 +0800 Subject: [PATCH 2/4] add word and cardinal --- itn/english/__init__.py | 0 itn/english/inverse_normalizer.py | 40 +++ itn/english/rules/__init__.py | 0 itn/english/rules/cardinal.py | 216 +++++++++++++ itn/english/rules/utils.py | 34 ++ itn/english/rules/word.py | 35 ++ itn/english/test/__init__.py | 13 + itn/english/test/data/cardinal.txt | 120 +++---- itn/english/test/data/date.txt | 140 ++++---- itn/english/test/data/decimal.txt | 258 +++++++-------- itn/english/test/data/electronic.txt | 118 +++---- itn/english/test/data/measure.txt | 464 +++++++++++++-------------- itn/english/test/data/money.txt | 108 +++---- itn/english/test/data/ordinal.txt | 68 ++-- itn/english/test/data/telephone.txt | 36 +-- itn/english/test/data/time.txt | 66 ++-- itn/english/test/data/whitelist.txt | 18 +- itn/english/test/data/word.txt | 98 +++--- itn/english/test/normalizer_test.py | 36 +++ itn/english/test/utils.py | 1 + 20 files changed, 1122 insertions(+), 747 deletions(-) create mode 100644 itn/english/__init__.py create mode 100644 itn/english/inverse_normalizer.py create mode 100644 itn/english/rules/__init__.py 
create mode 100644 itn/english/rules/cardinal.py create mode 100644 itn/english/rules/utils.py create mode 100644 itn/english/rules/word.py create mode 100644 itn/english/test/__init__.py create mode 100644 itn/english/test/normalizer_test.py create mode 120000 itn/english/test/utils.py diff --git a/itn/english/__init__.py b/itn/english/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/itn/english/inverse_normalizer.py b/itn/english/inverse_normalizer.py new file mode 100644 index 00000000..126cddbc --- /dev/null +++ b/itn/english/inverse_normalizer.py @@ -0,0 +1,40 @@ +# Copyright (c) 2022 Xingchen Song (sxc19@tsinghua.org.cn) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from importlib_resources import files +from pynini.lib.pynutil import add_weight, delete + +from itn.english.rules.cardinal import Cardinal +from itn.english.rules.word import Word +from tn.processor import Processor + + +class InverseNormalizer(Processor): + + def __init__(self, cache_dir=None, overwrite_cache=False): + super().__init__(name="en_inverse_normalizer", ordertype="itn") + if cache_dir is None: + cache_dir = files("itn") + self.build_fst("en_itn", cache_dir, overwrite_cache) + + def build_tagger(self): + tagger = (add_weight(Cardinal().tagger, 1.0) | add_weight(Word().tagger, 100)).optimize() + + tagger = tagger.star + # remove the last space + self.tagger = tagger @ self.build_rule(delete(" "), "", "[EOS]") + + def build_verbalizer(self): + verbalizer = (Cardinal().verbalizer | Word().verbalizer).optimize() + self.verbalizer = verbalizer.star diff --git a/itn/english/rules/__init__.py b/itn/english/rules/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/itn/english/rules/cardinal.py b/itn/english/rules/cardinal.py new file mode 100644 index 00000000..5b40b84b --- /dev/null +++ b/itn/english/rules/cardinal.py @@ -0,0 +1,216 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from pynini import closure, cross, difference, project, string_file, union +from pynini.lib.pynutil import delete, insert + +from itn.english.rules.utils import num_to_word +from tn.processor import Processor +from tn.utils import get_abs_path + + +class Cardinal(Processor): + """ + Finite state transducer for classifying cardinals + e.g. minus twenty three -> cardinal { integer: "23" negative: "-" } } + Numbers below thirteen are not converted. + + Args: + input_case: accepting either "lower_cased" or "cased" input. + """ + + def __init__(self): + super().__init__("cardinal") + self.build_tagger() + self.build_verbalizer() + + def delete_word(self, word: str): + """Capitalizes word for `cased` input""" + delete_graph = delete(word).optimize() + if len(word) > 0: + delete_graph |= delete(word[0].upper() + word[1:]) + return delete_graph.optimize() + + def build_tagger(self): + graph_zero = string_file(get_abs_path("../itn/english/data/numbers/zero.tsv")) + graph_digit = string_file(get_abs_path("../itn/english/data/numbers/digit.tsv")) + graph_ties = string_file(get_abs_path("../itn/english/data/numbers/ties.tsv")) + graph_teen = string_file(get_abs_path("../itn/english/data/numbers/teen.tsv")) + self.graph_two_digit = graph_teen | ((graph_ties) + self.DELETE_SPACE + (graph_digit | insert("0"))) + graph_hundred = cross("hundred", "") + + graph_hundred_component = union(graph_digit + self.DELETE_SPACE + graph_hundred, insert("0")) + graph_hundred_component += self.DELETE_SPACE + graph_hundred_component += union( + graph_teen | insert("00"), (graph_ties | insert("0")) + self.DELETE_SPACE + (graph_digit | insert("0")) + ) + + graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ ( + closure(self.DIGIT) + (self.DIGIT - "0") + closure(self.DIGIT) + ) + self.graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component_at_least_one_none_zero_digit + + # Transducer for eleven hundred -> 1100 or twenty one hundred eleven -> 
2111 + graph_hundred_as_thousand = union(graph_teen, graph_ties + self.DELETE_SPACE + graph_digit) + graph_hundred_as_thousand += self.DELETE_SPACE + graph_hundred + graph_hundred_as_thousand += self.DELETE_SPACE + union( + graph_teen | insert("00"), + (graph_ties | insert("0")) + self.DELETE_SPACE + (graph_digit | insert("0")), + ) + + graph_hundreds = graph_hundred_component | graph_hundred_as_thousand + + graph_ties_component = union( + graph_teen | insert("00"), + (graph_ties | insert("0")) + self.DELETE_SPACE + (graph_digit | insert("0")), + ) + + graph_ties_component_at_least_one_none_zero_digit = graph_ties_component @ ( + closure(self.DIGIT) + (self.DIGIT - "0") + closure(self.DIGIT) + ) + self.graph_ties_component_at_least_one_none_zero_digit = graph_ties_component_at_least_one_none_zero_digit + + # %%% International numeric format + graph_thousands = union( + graph_hundred_component_at_least_one_none_zero_digit + self.DELETE_SPACE + self.delete_word("thousand"), + insert("000", weight=0.1), + ) + + graph_million = union( + graph_hundred_component_at_least_one_none_zero_digit + self.DELETE_SPACE + self.delete_word("million"), + insert("000", weight=0.1), + ) + graph_billion = union( + graph_hundred_component_at_least_one_none_zero_digit + self.DELETE_SPACE + self.delete_word("billion"), + insert("000", weight=0.1), + ) + graph_trillion = union( + graph_hundred_component_at_least_one_none_zero_digit + self.DELETE_SPACE + self.delete_word("trillion"), + insert("000", weight=0.1), + ) + graph_quadrillion = union( + graph_hundred_component_at_least_one_none_zero_digit + self.DELETE_SPACE + self.delete_word("quadrillion"), + insert("000", weight=0.1), + ) + graph_quintillion = union( + graph_hundred_component_at_least_one_none_zero_digit + self.DELETE_SPACE + self.delete_word("quintillion"), + insert("000", weight=0.1), + ) + graph_sextillion = union( + graph_hundred_component_at_least_one_none_zero_digit + self.DELETE_SPACE + self.delete_word("sextillion"), + 
insert("000", weight=0.1), + ) + # %%% + + graph_int = ( + graph_sextillion + + self.DELETE_SPACE + + graph_quintillion + + self.DELETE_SPACE + + graph_quadrillion + + self.DELETE_SPACE + + graph_trillion + + self.DELETE_SPACE + + graph_billion + + self.DELETE_SPACE + + graph_million + + self.DELETE_SPACE + + graph_thousands + ) + + # %% Indian numeric format simple https://en.wikipedia.org/wiki/Indian_numbering_system + # This only covers "standard format". + # Conventional format like thousand crores/lakh crores is yet to be implemented + graph_in_thousands = union( + graph_ties_component_at_least_one_none_zero_digit + self.DELETE_SPACE + delete("thousand"), + insert("00", weight=0.1), + ) + graph_in_lakhs = union( + graph_ties_component_at_least_one_none_zero_digit + self.DELETE_SPACE + (delete("lakh") | delete("lakhs")), + insert("00", weight=0.1), + ) + + graph_in_crores = union( + graph_ties_component_at_least_one_none_zero_digit + + self.DELETE_SPACE + + (delete("crore") | delete("crores")), + insert("00", weight=0.1), + ) + + graph_in_arabs = union( + graph_ties_component_at_least_one_none_zero_digit + self.DELETE_SPACE + (delete("arab") | delete("arabs")), + insert("00", weight=0.1), + ) + + graph_in_kharabs = union( + graph_ties_component_at_least_one_none_zero_digit + + self.DELETE_SPACE + + (delete("kharab") | delete("kharabs")), + insert("00", weight=0.1), + ) + + graph_in_nils = union( + graph_ties_component_at_least_one_none_zero_digit + self.DELETE_SPACE + (delete("nil") | delete("nils")), + insert("00", weight=0.1), + ) + + graph_in_padmas = union( + graph_ties_component_at_least_one_none_zero_digit + + self.DELETE_SPACE + + (delete("padma") | delete("padmas")), + insert("00", weight=0.1), + ) + + graph_in_shankhs = union( + graph_ties_component_at_least_one_none_zero_digit + + self.DELETE_SPACE + + (delete("shankh") | delete("shankhs")), + insert("00", weight=0.1), + ) + + graph_ind = ( + graph_in_shankhs + + self.DELETE_SPACE + + graph_in_padmas 
+ + self.DELETE_SPACE + + graph_in_nils + + self.DELETE_SPACE + + graph_in_kharabs + + self.DELETE_SPACE + + graph_in_arabs + + self.DELETE_SPACE + + graph_in_crores + + self.DELETE_SPACE + + graph_in_lakhs + + self.DELETE_SPACE + + graph_in_thousands + ) + + graph = union((graph_int | graph_ind) + self.DELETE_SPACE + graph_hundreds, graph_zero) + graph = graph @ union(delete(closure("0")) + difference(self.DIGIT, "0") + closure(self.DIGIT), "0") + + labels_exception = [num_to_word(x) for x in range(0, 13)] + labels_exception += [x.capitalize() for x in labels_exception] + graph_exception = union(*labels_exception).optimize() + + graph = (self.build_rule(delete("and"), self.SPACE, self.SPACE) @ (self.ALPHA + self.SIGMA) @ graph).optimize() + graph |= ((self.TO_LOWER + self.SIGMA) @ graph).optimize() + + self.graph_no_exception = graph + self.graph = (project(graph, "input") - graph_exception.arcsort()) @ graph + tagger = ( + insert('value: "') + cross(union("minus", "Minus") + self.DELETE_SPACE, "-").ques + self.graph + insert('"') + ) + self.tagger = self.add_tokens(tagger).optimize() diff --git a/itn/english/rules/utils.py b/itn/english/rules/utils.py new file mode 100644 index 00000000..d73d7b9d --- /dev/null +++ b/itn/english/rules/utils.py @@ -0,0 +1,34 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Union + +import inflect + +_inflect = inflect.engine() + + +def num_to_word(x: Union[str, int]): + """ + converts integer to spoken representation + + Args + x: integer + + Returns: spoken representation + """ + if isinstance(x, int): + x = str(x) + x = _inflect.number_to_words(str(x)).replace("-", " ").replace(",", "") + return x diff --git a/itn/english/rules/word.py b/itn/english/rules/word.py new file mode 100644 index 00000000..1503b209 --- /dev/null +++ b/itn/english/rules/word.py @@ -0,0 +1,35 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pynini import closure +from pynini.lib.pynutil import insert + +from tn.processor import Processor + + +class Word(Processor): + """ + Finite state transducer for classifying plain tokens, that do not belong to any special class. This can be considered as the default class. + e.g. 
sleep -> word { value: "sleep" } + """ + + def __init__(self): + super().__init__("word") + self.build_tagger() + self.build_verbalizer() + + def build_tagger(self): + word = insert('value: "') + self.NOT_SPACE.star + insert('"') + self.tagger = self.add_tokens(word.optimize()) diff --git a/itn/english/test/__init__.py b/itn/english/test/__init__.py new file mode 100644 index 00000000..8b82801f --- /dev/null +++ b/itn/english/test/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/itn/english/test/data/cardinal.txt b/itn/english/test/data/cardinal.txt index 1155712c..270837ba 100644 --- a/itn/english/test/data/cardinal.txt +++ b/itn/english/test/data/cardinal.txt @@ -1,60 +1,60 @@ -nine trillion seven hundred eighty nine billion three hundred eighty two million five hundred thirty six thousand one hundred thirty~9789382536130 -two hundred and fifty four~254 -one hundred forty seven thousand four hundred fifty one~147451 -one million one hundred fifty six thousand one hundred seventy three~1156173 -one billion five hundred ninety three million seventy two thousand nine hundred sixty one~1593072961 -One Billion five hundred ninety three Million seventy two thousand nine hundred sixty one~1593072961 -ninety seven quadrillion eight hundred eight trillion two hundred sixty four billion seven hundred seventy two million seven hundred ninety two thousand five~97808264772792005 -seventeen sextillion eight hundred fifty five quintillion thirty six quadrillion six hundred fifty seven trillion seven billion five hundred ninety six million one hundred ten thousand nine hundred forty nine~17855036657007596110949 -ten quadrillion ten trillion ten million one hundred thousand ten~10010000010100010 -minus twenty five thousand thirty seven~-25037 -one quadrillion two hundred sixty four trillion three hundred one billion nine hundred thirty eight million one hundred four~1264301938000104 -minus sixty~-60 -forty six thousand six hundred sixty four~46664 -sixty~60 -zero~zero -two million three~2000003 -one thousand thirteen~1013 -one thousand one~1001 -one thousand one hundred~1100 -one thousand twenty six~1026 -one thousand one hundred twenty six~1126 -eighteen million four hundred fifty thousand nine hundred ninety~18450990 -eighteen million nine hundred forty thousand seven hundred twenty two~18940722 -eighteen million six hundred ninety thousand nine hundred sixteen~18690916 -eighteen thousand eight hundred eighty~18880 -eleven hundred~1100 
-twenty one hundred~2100 -twenty one hundred and eleven~2111 -eleven hundred twenty one~1121 -Nine trillion seven hundred eighty nine billion three hundred eighty two million five hundred thirty six thousand one hundred thirty~9789382536130 -Two hundred and fifty four~254 -One hundred forty seven thousand four hundred fifty one~147451 -One million one hundred fifty six thousand one hundred seventy three~1156173 -One billion five hundred ninety three million seventy two thousand nine hundred sixty one~1593072961 -Ninety seven quadrillion eight hundred eight trillion two hundred sixty four billion seven hundred seventy two million seven hundred ninety two thousand five~97808264772792005 -Seventeen sextillion eight hundred fifty five quintillion thirty six quadrillion six hundred fifty seven trillion seven billion five hundred ninety six million one hundred ten thousand nine hundred forty nine~17855036657007596110949 -Ten quadrillion ten trillion ten million one hundred thousand ten~10010000010100010 -Minus twenty five thousand thirty seven~-25037 -One quadrillion two hundred sixty four trillion three hundred one billion nine hundred thirty eight million one hundred four~1264301938000104 -Minus sixty~-60 -Forty six thousand six hundred sixty four~46664 -Sixty~60 -Zero~Zero -Two million three~2000003 -One thousand thirteen~1013 -One thousand one~1001 -One thousand one hundred~1100 -One thousand twenty six~1026 -One thousand one hundred twenty six~1126 -Eighteen million four hundred fifty thousand nine hundred ninety~18450990 -Eighteen million nine hundred forty thousand seven hundred twenty two~18940722 -Eighteen million six hundred ninety thousand nine hundred sixteen~18690916 -Eighteen thousand eight hundred eighty~18880 -Eleven hundred~1100 -Twenty one hundred~2100 -Twenty one hundred and eleven~2111 -Eleven hundred twenty one~1121 -Nineteen~19 -Twelve~Twelve -twenty one crore ninety eight lakh thirty six thousand five hundred and ninety three~219836593 +nine 
trillion seven hundred eighty nine billion three hundred eighty two million five hundred thirty six thousand one hundred thirty => 9789382536130 +two hundred and fifty four => 254 +one hundred forty seven thousand four hundred fifty one => 147451 +one million one hundred fifty six thousand one hundred seventy three => 1156173 +one billion five hundred ninety three million seventy two thousand nine hundred sixty one => 1593072961 +One Billion five hundred ninety three Million seventy two thousand nine hundred sixty one => 1593072961 +ninety seven quadrillion eight hundred eight trillion two hundred sixty four billion seven hundred seventy two million seven hundred ninety two thousand five => 97808264772792005 +seventeen sextillion eight hundred fifty five quintillion thirty six quadrillion six hundred fifty seven trillion seven billion five hundred ninety six million one hundred ten thousand nine hundred forty nine => 17855036657007596110949 +ten quadrillion ten trillion ten million one hundred thousand ten => 10010000010100010 +minus twenty five thousand thirty seven => -25037 +one quadrillion two hundred sixty four trillion three hundred one billion nine hundred thirty eight million one hundred four => 1264301938000104 +minus sixty => -60 +forty six thousand six hundred sixty four => 46664 +sixty => 60 +zero => zero +two million three => 2000003 +one thousand thirteen => 1013 +one thousand one => 1001 +one thousand one hundred => 1100 +one thousand twenty six => 1026 +one thousand one hundred twenty six => 1126 +eighteen million four hundred fifty thousand nine hundred ninety => 18450990 +eighteen million nine hundred forty thousand seven hundred twenty two => 18940722 +eighteen million six hundred ninety thousand nine hundred sixteen => 18690916 +eighteen thousand eight hundred eighty => 18880 +eleven hundred => 1100 +twenty one hundred => 2100 +twenty one hundred and eleven => 2111 +eleven hundred twenty one => 1121 +Nine trillion seven hundred eighty nine 
billion three hundred eighty two million five hundred thirty six thousand one hundred thirty => 9789382536130 +Two hundred and fifty four => 254 +One hundred forty seven thousand four hundred fifty one => 147451 +One million one hundred fifty six thousand one hundred seventy three => 1156173 +One billion five hundred ninety three million seventy two thousand nine hundred sixty one => 1593072961 +Ninety seven quadrillion eight hundred eight trillion two hundred sixty four billion seven hundred seventy two million seven hundred ninety two thousand five => 97808264772792005 +Seventeen sextillion eight hundred fifty five quintillion thirty six quadrillion six hundred fifty seven trillion seven billion five hundred ninety six million one hundred ten thousand nine hundred forty nine => 17855036657007596110949 +Ten quadrillion ten trillion ten million one hundred thousand ten => 10010000010100010 +Minus twenty five thousand thirty seven => -25037 +One quadrillion two hundred sixty four trillion three hundred one billion nine hundred thirty eight million one hundred four => 1264301938000104 +Minus sixty => -60 +Forty six thousand six hundred sixty four => 46664 +Sixty => 60 +Zero => Zero +Two million three => 2000003 +One thousand thirteen => 1013 +One thousand one => 1001 +One thousand one hundred => 1100 +One thousand twenty six => 1026 +One thousand one hundred twenty six => 1126 +Eighteen million four hundred fifty thousand nine hundred ninety => 18450990 +Eighteen million nine hundred forty thousand seven hundred twenty two => 18940722 +Eighteen million six hundred ninety thousand nine hundred sixteen => 18690916 +Eighteen thousand eight hundred eighty => 18880 +Eleven hundred => 1100 +Twenty one hundred => 2100 +Twenty one hundred and eleven => 2111 +Eleven hundred twenty one => 1121 +Nineteen => 19 +Twelve => Twelve +twenty one crore ninety eight lakh thirty six thousand five hundred and ninety three => 219836593 diff --git a/itn/english/test/data/date.txt 
b/itn/english/test/data/date.txt index d121cc0a..5845363c 100644 --- a/itn/english/test/data/date.txt +++ b/itn/english/test/data/date.txt @@ -1,70 +1,70 @@ -july twenty fifth two thousand twelve~july 25 2012 -nineteen eighties~1980s -two thousand and twenty~2020 -two thousand and nine~2009 -the twenty fifth of july twenty twelve~25 july 2012 -the twenty fifth of july two thousand twelve~25 july 2012 -the twenty second of july twenty twelve~22 july 2012 -the fifteenth of january~15 january -the seventeenth of may twenty ten~17 may 2010 -january first~january 1 -july twenty second two thousand eight~july 22 2008 -june thirty~june 30 -july twenty fifth twenty twelve~july 25 2012 -nineteen seventeen~1917 -twenty twelve~2012 -march sixteen sixty five~march 1665 -sixteen sixty five~1665 -july two thousand twelve~july 2012 -october nineteen oh five~october 1905 -july fifteen o six~july 1506 -the twenty fifth of july twenty twelve~25 july 2012 -july twenty fifth twenty twelve~july 25 2012 -july twenty fifth two thousand twelve~july 25 2012 -july one thousand eight hundred seventy six~july 1876 -february twenty fifth twenty sixteen~february 25 2016 -november twenty fourth twenty fourteen~november 24 2014 -nineteen ninety four~1994 -two thousand three~2003 -one thousand eight~1008 -nineteen seventy six~1976 -june twentieth twenty fourteen~june 20 2014 -nineteen seventy three~1973 -nineteen seventy five~1975 -eleven fifty five~1155 -July twenty fifth two thousand twelve~July 25 2012 -Nineteen eighties~1980s -Two thousand and twenty~2020 -Two thousand and nine~2009 -The twenty fifth of july twenty twelve~25 july 2012 -The twenty fifth of july two thousand twelve~25 july 2012 -The twenty second of july twenty twelve~22 july 2012 -The fifteenth of january~15 january -The fifteenth of January~15 January -The seventeenth of may twenty ten~17 may 2010 -January first~January 1 -July twenty second two thousand eight~July 22 2008 -June thirty~June 30 -July twenty fifth twenty 
twelve~July 25 2012 -Nineteen seventeen~1917 -Twenty twelve~2012 -March sixteen sixty five~March 1665 -Sixteen sixty five~1665 -July two thousand twelve~July 2012 -October nineteen oh five~October 1905 -July fifteen o six~July 1506 -The twenty fifth of july twenty twelve~25 july 2012 -The twenty fifth of July twenty twelve~25 July 2012 -July twenty fifth twenty twelve~July 25 2012 -July twenty fifth two thousand twelve~July 25 2012 -July one thousand eight hundred seventy six~July 1876 -February twenty fifth twenty sixteen~February 25 2016 -November twenty fourth twenty fourteen~November 24 2014 -Nineteen ninety four~1994 -Two thousand three~2003 -One thousand eight~1008 -Nineteen seventy six~1976 -June twentieth twenty fourteen~June 20 2014 -Nineteen seventy three~1973 -Nineteen seventy five~1975 -Eleven fifty five~1155 +july twenty fifth two thousand twelve => july 25 2012 +nineteen eighties => 1980s +two thousand and twenty => 2020 +two thousand and nine => 2009 +the twenty fifth of july twenty twelve => 25 july 2012 +the twenty fifth of july two thousand twelve => 25 july 2012 +the twenty second of july twenty twelve => 22 july 2012 +the fifteenth of january => 15 january +the seventeenth of may twenty ten => 17 may 2010 +january first => january 1 +july twenty second two thousand eight => july 22 2008 +june thirty => june 30 +july twenty fifth twenty twelve => july 25 2012 +nineteen seventeen => 1917 +twenty twelve => 2012 +march sixteen sixty five => march 1665 +sixteen sixty five => 1665 +july two thousand twelve => july 2012 +october nineteen oh five => october 1905 +july fifteen o six => july 1506 +the twenty fifth of july twenty twelve => 25 july 2012 +july twenty fifth twenty twelve => july 25 2012 +july twenty fifth two thousand twelve => july 25 2012 +july one thousand eight hundred seventy six => july 1876 +february twenty fifth twenty sixteen => february 25 2016 +november twenty fourth twenty fourteen => november 24 2014 +nineteen ninety four => 1994 
+two thousand three => 2003 +one thousand eight => 1008 +nineteen seventy six => 1976 +june twentieth twenty fourteen => june 20 2014 +nineteen seventy three => 1973 +nineteen seventy five => 1975 +eleven fifty five => 1155 +July twenty fifth two thousand twelve => July 25 2012 +Nineteen eighties => 1980s +Two thousand and twenty => 2020 +Two thousand and nine => 2009 +The twenty fifth of july twenty twelve => 25 july 2012 +The twenty fifth of july two thousand twelve => 25 july 2012 +The twenty second of july twenty twelve => 22 july 2012 +The fifteenth of january => 15 january +The fifteenth of January => 15 January +The seventeenth of may twenty ten => 17 may 2010 +January first => January 1 +July twenty second two thousand eight => July 22 2008 +June thirty => June 30 +July twenty fifth twenty twelve => July 25 2012 +Nineteen seventeen => 1917 +Twenty twelve => 2012 +March sixteen sixty five => March 1665 +Sixteen sixty five => 1665 +July two thousand twelve => July 2012 +October nineteen oh five => October 1905 +July fifteen o six => July 1506 +The twenty fifth of july twenty twelve => 25 july 2012 +The twenty fifth of July twenty twelve => 25 July 2012 +July twenty fifth twenty twelve => July 25 2012 +July twenty fifth two thousand twelve => July 25 2012 +July one thousand eight hundred seventy six => July 1876 +February twenty fifth twenty sixteen => February 25 2016 +November twenty fourth twenty fourteen => November 24 2014 +Nineteen ninety four => 1994 +Two thousand three => 2003 +One thousand eight => 1008 +Nineteen seventy six => 1976 +June twentieth twenty fourteen => June 20 2014 +Nineteen seventy three => 1973 +Nineteen seventy five => 1975 +Eleven fifty five => 1155 diff --git a/itn/english/test/data/decimal.txt b/itn/english/test/data/decimal.txt index b9cc1ac7..46570c70 100644 --- a/itn/english/test/data/decimal.txt +++ b/itn/english/test/data/decimal.txt @@ -1,129 +1,129 @@ -five point two million~5.2 million -one hundred sixty four point five 
eight thousand~164.58 thousand -four hundred million~400 million -fifty billion~50 billion -four hundred five billion~405 billion -four point eight five billion~4.85 billion -one hundred billion~100 billion -one hundred ten billion~110 billion -one hundred thirty two billion~132 billion -one point eight four billion~1.84 billion -one point eight one billion~1.81 billion -one point five nine billion~1.59 billion -one point four five three billion~1.453 billion -one point seven two billion~1.72 billion -one point two five billion~1.25 billion -thirteen billion~13 billion -thirty billion~30 billion -two thousand eight hundred five point eight seven three billion~2805.873 billion -seventy trillion~70 trillion -thirteen million~13 million -eighteen billion~18 billion -four hundred fifty million~450 million -one hundred thirty million~130 million -ten million~10 million -four hundred million~400 million -five million~5 million -five hundred million~500 million -twelve million~12 million -thirteen million~13 million -four million~4 million -forty five million~45 million -fifteen million~15 million -fifteen trillion~15 trillion -fifteen billion~15 billion -two million~2 million -eight million~8 million -point one two o five~.1205 -minus sixty point two four zero zero~-60.2400 -zero point two six~0.26 -point zero two~.02 -sixty point two~60.2 -eighteen~18 -eighteen point eight five~18.85 -eighteen point five o~18.50 -eighteen point five six~18.56 -eighteen point nine~18.9 -eighteen point o five~18.05 -eighteen point one two~18.12 -eighteen point o one~18.01 -eighteen point o o o~18.000 -eighteen point six~18.6 -eighteen point three o o~18.300 -eighteen point three six~18.36 -eighteen point two five~18.25 -eighteen point two two~18.22 -eight hundred eighteen point three o three~818.303 -eight hundred eight point eight~808.8 -eight hundred eight point zero~808.0 -eight hundred eighty eight point one~888.1 -eight hundred eighty four point three~884.3 -eight hundred eighty two 
point eight~882.8 -eight hundred eighty two point zero~882.0 -eight hundred forty five point nine four~845.94 -Five point two million~5.2 million -One hundred sixty four point five eight thousand~164.58 thousand -Four hundred million~400 million -Fifty billion~50 billion -Four hundred five billion~405 billion -Four point eight five billion~4.85 billion -One hundred billion~100 billion -One hundred ten billion~110 billion -One hundred thirty two billion~132 billion -One point eight four billion~1.84 billion -One point eight four Billion~1.84 Billion -One point eight one billion~1.81 billion -One point five nine billion~1.59 billion -One point four five three billion~1.453 billion -One point seven two billion~1.72 billion -One point two five billion~1.25 billion -Thirteen billion~13 billion -Thirty billion~30 billion -Thirty Billion~30 Billion -Two thousand eight hundred five point eight seven three billion~2805.873 billion -Seventy trillion~70 trillion -Thirteen million~13 million -Eighteen billion~18 billion -Four hundred fifty million~450 million -One hundred thirty million~130 million -Ten million~10 million -Four hundred million~400 million -Five million~5 million -Five hundred million~500 million -Twelve million~12 million -Thirteen million~13 million -Four million~4 million -Forty five million~45 million -Fifteen million~15 million -Fifteen trillion~15 trillion -Fifteen billion~15 billion -Fifteen Billion~15 Billion -Two million~2 million -Eight million~8 million -Point one two o five~.1205 -Minus sixty point two four zero zero~-60.2400 -Zero point two six~0.26 -Point zero two~.02 -Sixty point two~60.2 -Eighteen~18 -Eighteen point eight five~18.85 -Eighteen point five o~18.50 -Eighteen point five six~18.56 -Eighteen point nine~18.9 -Eighteen point o five~18.05 -Eighteen point one two~18.12 -Eighteen point o one~18.01 -Eighteen point o o o~18.000 -Eighteen point six~18.6 -Eighteen point three o o~18.300 -Eighteen point three six~18.36 -Eighteen point two 
five~18.25 -Eighteen point two two~18.22 -Eight hundred eighteen point three o three~818.303 -Eight hundred eight point eight~808.8 -Eight hundred eight point zero~808.0 -Eight hundred eighty eight point one~888.1 -Eight hundred eighty four point three~884.3 -Eight hundred eighty two point eight~882.8 -Eight hundred eighty two point zero~882.0 -Eight hundred forty five point nine four~845.94 +five point two million => 5.2 million +one hundred sixty four point five eight thousand => 164.58 thousand +four hundred million => 400 million +fifty billion => 50 billion +four hundred five billion => 405 billion +four point eight five billion => 4.85 billion +one hundred billion => 100 billion +one hundred ten billion => 110 billion +one hundred thirty two billion => 132 billion +one point eight four billion => 1.84 billion +one point eight one billion => 1.81 billion +one point five nine billion => 1.59 billion +one point four five three billion => 1.453 billion +one point seven two billion => 1.72 billion +one point two five billion => 1.25 billion +thirteen billion => 13 billion +thirty billion => 30 billion +two thousand eight hundred five point eight seven three billion => 2805.873 billion +seventy trillion => 70 trillion +thirteen million => 13 million +eighteen billion => 18 billion +four hundred fifty million => 450 million +one hundred thirty million => 130 million +ten million => 10 million +four hundred million => 400 million +five million => 5 million +five hundred million => 500 million +twelve million => 12 million +thirteen million => 13 million +four million => 4 million +forty five million => 45 million +fifteen million => 15 million +fifteen trillion => 15 trillion +fifteen billion => 15 billion +two million => 2 million +eight million => 8 million +point one two o five => .1205 +minus sixty point two four zero zero => -60.2400 +zero point two six => 0.26 +point zero two => .02 +sixty point two => 60.2 +eighteen => 18 +eighteen point eight five => 18.85 
+eighteen point five o => 18.50 +eighteen point five six => 18.56 +eighteen point nine => 18.9 +eighteen point o five => 18.05 +eighteen point one two => 18.12 +eighteen point o one => 18.01 +eighteen point o o o => 18.000 +eighteen point six => 18.6 +eighteen point three o o => 18.300 +eighteen point three six => 18.36 +eighteen point two five => 18.25 +eighteen point two two => 18.22 +eight hundred eighteen point three o three => 818.303 +eight hundred eight point eight => 808.8 +eight hundred eight point zero => 808.0 +eight hundred eighty eight point one => 888.1 +eight hundred eighty four point three => 884.3 +eight hundred eighty two point eight => 882.8 +eight hundred eighty two point zero => 882.0 +eight hundred forty five point nine four => 845.94 +Five point two million => 5.2 million +One hundred sixty four point five eight thousand => 164.58 thousand +Four hundred million => 400 million +Fifty billion => 50 billion +Four hundred five billion => 405 billion +Four point eight five billion => 4.85 billion +One hundred billion => 100 billion +One hundred ten billion => 110 billion +One hundred thirty two billion => 132 billion +One point eight four billion => 1.84 billion +One point eight four Billion => 1.84 Billion +One point eight one billion => 1.81 billion +One point five nine billion => 1.59 billion +One point four five three billion => 1.453 billion +One point seven two billion => 1.72 billion +One point two five billion => 1.25 billion +Thirteen billion => 13 billion +Thirty billion => 30 billion +Thirty Billion => 30 Billion +Two thousand eight hundred five point eight seven three billion => 2805.873 billion +Seventy trillion => 70 trillion +Thirteen million => 13 million +Eighteen billion => 18 billion +Four hundred fifty million => 450 million +One hundred thirty million => 130 million +Ten million => 10 million +Four hundred million => 400 million +Five million => 5 million +Five hundred million => 500 million +Twelve million => 12 million 
+Thirteen million => 13 million +Four million => 4 million +Forty five million => 45 million +Fifteen million => 15 million +Fifteen trillion => 15 trillion +Fifteen billion => 15 billion +Fifteen Billion => 15 Billion +Two million => 2 million +Eight million => 8 million +Point one two o five => .1205 +Minus sixty point two four zero zero => -60.2400 +Zero point two six => 0.26 +Point zero two => .02 +Sixty point two => 60.2 +Eighteen => 18 +Eighteen point eight five => 18.85 +Eighteen point five o => 18.50 +Eighteen point five six => 18.56 +Eighteen point nine => 18.9 +Eighteen point o five => 18.05 +Eighteen point one two => 18.12 +Eighteen point o one => 18.01 +Eighteen point o o o => 18.000 +Eighteen point six => 18.6 +Eighteen point three o o => 18.300 +Eighteen point three six => 18.36 +Eighteen point two five => 18.25 +Eighteen point two two => 18.22 +Eight hundred eighteen point three o three => 818.303 +Eight hundred eight point eight => 808.8 +Eight hundred eight point zero => 808.0 +Eight hundred eighty eight point one => 888.1 +Eight hundred eighty four point three => 884.3 +Eight hundred eighty two point eight => 882.8 +Eight hundred eighty two point zero => 882.0 +Eight hundred forty five point nine four => 845.94 diff --git a/itn/english/test/data/electronic.txt b/itn/english/test/data/electronic.txt index 07df3f04..556e8be0 100644 --- a/itn/english/test/data/electronic.txt +++ b/itn/english/test/data/electronic.txt @@ -1,59 +1,59 @@ -a dot b c at g mail dot com~a.bc@gmail.com -a at gmail dot com~a@gmail.com -a at m s n dot fr~a@msn.fr -a at a o l dot com~a@aol.com -a at m s n dot com~a@msn.com -a at nvidia dot com~a@nvidia.com -a dot b c at nvidia dot com~a.bc@nvidia.com -c d f at a b c dot e d u~cdf@abc.edu -a b c at g mail dot a b c~abc@gmail.abc -a b c at a b c dot com~abc@abc.com -a s d f one two three at a b c dot com~asdf123@abc.com -a one b two at a b c dot com~a1b2@abc.com -a b three dot s d d dot three at g mail dot com~ab3.sdd.3@gmail.com 
-dot three at g mail dot com~dot 3@gmail.com -one three at g mail dot com~13@gmail.com -a b three hyphen s d d dash three at g mail dot com~ab3-sdd-3@gmail.com -h t t p colon slash slash w w w dot o u r d a i l y n e w s dot com dot s m~http://www.ourdailynews.com.sm -h t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b dot s m~http://www.comdailynews.ab.sm -h t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b slash s m~http://www.comdailynews.ab/sm -w w w dot c o m d a i l y n e w s dot a b slash s m~www.comdailynews.ab/sm -c o m d a i l y n e w s dot a b slash s m~comdailynews.ab/sm -n vidia dot com~nvidia.com -abc at gmail dot com~abc@gmail.com -athreed at gmail dot com~athreed@gmail.com -kore dot ai~kore.ai -A dot b c at g mail dot com~A.bc@gmail.com -A at gmail dot com~A@gmail.com -A at m s n dot fr~A@msn.fr -A at a o l dot com~A@aol.com -A at m s n dot com~A@msn.com -A at nvidia dot com~A@nvidia.com -A dot b c at nvidia dot com~A.bc@nvidia.com -C d f at a b c dot e d u~Cdf@abc.edu -A b c at g mail dot a b c~Abc@gmail.abc -A b c at a b c dot com~Abc@abc.com -A s d f one two three at a b c dot com~Asdf123@abc.com -A one b two at a b c dot com~A1b2@abc.com -A b three dot s d d dot three at g mail dot com~Ab3.sdd.3@gmail.com -Dot three at g mail dot com~Dot 3@gmail.com -One three at g mail dot com~13@gmail.com -A b three hyphen s d d dash three at g mail dot com~Ab3-sdd-3@gmail.com -H t t p colon slash slash w w w dot o u r d a i l y n e w s dot com dot s m~http://www.ourdailynews.com.sm -HTTP colon slash slash w w w dot o u r d a i l y n e w s dot com dot s m~http://www.ourdailynews.com.sm -HTTP colon slash slash w w w dot o u r d a i l y n e w s dot com dot Zero one~http://www.ourdailynews.com.01 -http colon slash slash w w w dot o u r d a i l y n e w s dot com dot SM and s ~http://www.ourdailynews.com.SM and s -Http colon slash slash w w w dot o u r d a i l y n e w s dot COM dot s m~http://www.ourdailynews.COM.sm -H t t p colon 
slash slash w w w dot c o m d a i l y n e w s dot a b dot s m~http://www.comdailynews.ab.sm -H t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b slash s m~http://www.comdailynews.ab/sm -W w w dot c o m d a i l y n e w s dot a b slash s m~www.comdailynews.ab/sm -C o m d a i l y n e w s dot a b slash s m~Comdailynews.ab/sm -N vidia dot com~nvidia.com -Abc at gmail dot com~Abc@gmail.com -Athreed at gmail dot com~Athreed@gmail.com -Kore dot ai~Kore.ai -NVIDIA dot com~NVIDIA.com -NVIDIA dot COM~NVIDIA.COM -WWW.A B C at A B C dot com~WWW.A BC@ABC.com -W W W. A B C dot com~www.ABC.com -w w w . o u r d a i l y n e w s dot com . s m~www.ourdailynews.com . s m +a dot b c at g mail dot com => a.bc@gmail.com +a at gmail dot com => a@gmail.com +a at m s n dot fr => a@msn.fr +a at a o l dot com => a@aol.com +a at m s n dot com => a@msn.com +a at nvidia dot com => a@nvidia.com +a dot b c at nvidia dot com => a.bc@nvidia.com +c d f at a b c dot e d u => cdf@abc.edu +a b c at g mail dot a b c => abc@gmail.abc +a b c at a b c dot com => abc@abc.com +a s d f one two three at a b c dot com => asdf123@abc.com +a one b two at a b c dot com => a1b2@abc.com +a b three dot s d d dot three at g mail dot com => ab3.sdd.3@gmail.com +dot three at g mail dot com => dot 3@gmail.com +one three at g mail dot com => 13@gmail.com +a b three hyphen s d d dash three at g mail dot com => ab3-sdd-3@gmail.com +h t t p colon slash slash w w w dot o u r d a i l y n e w s dot com dot s m => http://www.ourdailynews.com.sm +h t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b dot s m => http://www.comdailynews.ab.sm +h t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b slash s m => http://www.comdailynews.ab/sm +w w w dot c o m d a i l y n e w s dot a b slash s m => www.comdailynews.ab/sm +c o m d a i l y n e w s dot a b slash s m => comdailynews.ab/sm +n vidia dot com => nvidia.com +abc at gmail dot com => abc@gmail.com +athreed at gmail dot com => 
athreed@gmail.com +kore dot ai => kore.ai +A dot b c at g mail dot com => A.bc@gmail.com +A at gmail dot com => A@gmail.com +A at m s n dot fr => A@msn.fr +A at a o l dot com => A@aol.com +A at m s n dot com => A@msn.com +A at nvidia dot com => A@nvidia.com +A dot b c at nvidia dot com => A.bc@nvidia.com +C d f at a b c dot e d u => Cdf@abc.edu +A b c at g mail dot a b c => Abc@gmail.abc +A b c at a b c dot com => Abc@abc.com +A s d f one two three at a b c dot com => Asdf123@abc.com +A one b two at a b c dot com => A1b2@abc.com +A b three dot s d d dot three at g mail dot com => Ab3.sdd.3@gmail.com +Dot three at g mail dot com => Dot 3@gmail.com +One three at g mail dot com => 13@gmail.com +A b three hyphen s d d dash three at g mail dot com => Ab3-sdd-3@gmail.com +H t t p colon slash slash w w w dot o u r d a i l y n e w s dot com dot s m => http://www.ourdailynews.com.sm +HTTP colon slash slash w w w dot o u r d a i l y n e w s dot com dot s m => http://www.ourdailynews.com.sm +HTTP colon slash slash w w w dot o u r d a i l y n e w s dot com dot Zero one => http://www.ourdailynews.com.01 +http colon slash slash w w w dot o u r d a i l y n e w s dot com dot SM and s => http://www.ourdailynews.com.SM and s +Http colon slash slash w w w dot o u r d a i l y n e w s dot COM dot s m => http://www.ourdailynews.COM.sm +H t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b dot s m => http://www.comdailynews.ab.sm +H t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b slash s m => http://www.comdailynews.ab/sm +W w w dot c o m d a i l y n e w s dot a b slash s m => www.comdailynews.ab/sm +C o m d a i l y n e w s dot a b slash s m => Comdailynews.ab/sm +N vidia dot com => nvidia.com +Abc at gmail dot com => Abc@gmail.com +Athreed at gmail dot com => Athreed@gmail.com +Kore dot ai => Kore.ai +NVIDIA dot com => NVIDIA.com +NVIDIA dot COM => NVIDIA.COM +WWW.A B C at A B C dot com => WWW.A BC@ABC.com +W W W. A B C dot com => www.ABC.com +w w w . 
o u r d a i l y n e w s dot com . s m => www.ourdailynews.com . s m diff --git a/itn/english/test/data/measure.txt b/itn/english/test/data/measure.txt index 1cf435d0..64fa9785 100644 --- a/itn/english/test/data/measure.txt +++ b/itn/english/test/data/measure.txt @@ -1,232 +1,232 @@ -two hundred meters~200 m -fifty six point three per square kilometer~56.3 /km² -two hundred kilometers per hour~200 km/h -twenty eight kilograms force per square centimeter~28 kgf/cm² -forty two thousand two hundred fifty nine per square meter~42259 /m² -minus two thousand twelve kilo liters~-2012 kl -minus sixty six kilograms~-66 kg -two kilo watt hours~2 kWh -one point o o o o two eight cubic deci meters~1.000028 dm³ -seven point five peta bytes~7.5 pb -three hours~3 h -one milli volt~1 mv -two cubic meters~2 m³ -ninety grams~90 g -one hundred twenty four point three lumens~124.3 lm -four hundred forty milliliters~440 ml -thirty one thousand four hundred eighty square feet~31480 sq ft -one thousand six hundred hours~1600 h -thirty one thousand four hundred eighty square feet~31480 sq ft -two square miles~2 sq mi -zero point one nine square miles~0.19 sq mi -one thousand five hundred thirty one c c~1531 cc -three hundred micrometers~300 μm -sixty five thousand square kilometers~65000 km² -two miles per hour~2 mph -two hundred forty five miles per hour~245 mph -one hundred fifty c c~150 cc -sixty point two four zero zero kilograms~60.2400 kg -zero feet~0 ft -zero foot~0 ft -two feet~2 ft -twenty foot~20 ft -point two meters~.2 m -two square meters~2 m² -eighteen feet~18 ft -eighteen mega siemens~18 ms -eighteen ounces~18 oz -eighteen point five kilometers~18.5 km -eighteen point five two square kilometers~18.52 km² -eighteen point nine one square kilometers~18.91 km² -eighteen point one four percent~18.14 % -eighteen point one six percent~18.16 % -eighteen point one square kilometers~18.1 km² -eighteen point six percent~18.6 % -eighteen point two two kilometers~18.22 km -eighteen point 
zero kilometers~18.0 km -eighteen point zero percent~18.0 % -eighteen square kilometers~18 km² -eighteen thousand eight hundred giga watt hours~18800 gWh -eighteen thousand seven hundred hectares~18700 ha -eight hectares~8 ha -eight hundred eighty five astronomical units~885 au -eight hundred eighty hectares~880 ha -eight hundred eighty kilobytes~880 kb -eight hundred eighty kilometers~880 km -eight hundred eighty nine feet~889 ft -eight hundred eighty six kilometers~886 km -eight hundred eighty two megawatts~882 mW -eight hundred feet~800 ft -eight hundred fifty five square kilometers~855 km² -eight hundred fifty megahertz~850 mhz -eight hundred fifty meters~850 m -eight hundred fifty nanometers~850 nm -eight hundred fifty one meters~851 m -eight hundred fifty seven square kilometers~857 km² -eight hundred fifty three meters~853 m -eight hundred fifty three point six meters~853.6 m -eight hundred five point four six square kilometers~805.46 km² -eight hundred forty two point nine meters~842.9 m -eight hundred forty two square kilometers~842 km² -eight hundred gigabytes~800 gb -eight hundred horsepower~800 hp -eight hundred kilograms~800 kg -eight hundred kilo watt hours~800 kWh -eight hundred kilowatts~800 kW -eight hundred megahertz~800 mhz -eight hundred ninety four c c~894 cc -eight hundred ninety kilowatts~890 kW -eight hundred ninety millimeters~890 mm -eight hundred ninety two square kilometers~892 km² -eight hundred seventy horsepower~870 hp -eight hundred seventy meters~870 m -eight hundred sixty kilograms~860 kg -eight hundred sixty kilometers~860 km -eight hundred sixty miles~860 mi -eight hundred sixty six feet~866 ft -eight hundred ten hectares~810 ha -eight hundred ten kilohertz~810 khz -eight hundred thirty eight point two millimeters~838.2 mm -eight hundred thirty five kilometers~835 km -eight hundred thirty kilohertz~830 khz -eight hundred thirty megawatts~830 mW -eight hundred thirty nine kilometers~839 km -eight hundred thirty six meters~836 m 
-eight hundred twenty feet~820 ft -eight hundred twenty kilometers~820 km -eight hundred twenty meters~820 m -eight hundred twenty one point zero feet~821.0 ft -eight hundred two point eight nine kilometers~802.89 km -eight hundred volts~800 v -eight kilobits~8 kb -eight kilograms~8 kg -eight million two hundred thousand feet~8200000 ft -eight point eight kilometers~8.8 km -eight point eight meters~8.8 m -eight point eight miles~8.8 mi -eight point five centimeters~8.5 cm -eight point five five percent~8.55 % -eight point five megawatts~8.5 mW -eight point five meters~8.5 m -eight point five two percent~8.52 % -eight point four four percent~8.44 % -Two hundred meters~200 m -Fifty six point three per square kilometer~56.3 /km² -Two hundred kilometers per hour~200 km/h -Twenty eight kilograms force per square centimeter~28 kgf/cm² -Forty two thousand two hundred fifty nine per square meter~42259 /m² -Minus two thousand twelve kilo liters~-2012 kl -Minus sixty six kilograms~-66 kg -Two kilo watt hours~2 kWh -One point o o o o two eight cubic deci meters~1.000028 dm³ -Seven point five peta bytes~7.5 pb -Three hours~3 h -One milli volt~1 mv -Two cubic meters~2 m³ -Ninety grams~90 g -One hundred twenty four point three lumens~124.3 lm -Four hundred forty milliliters~440 ml -Thirty one thousand four hundred eighty square feet~31480 sq ft -One thousand six hundred hours~1600 h -Thirty one thousand four hundred eighty square feet~31480 sq ft -Two square miles~2 sq mi -Zero point one nine square miles~0.19 sq mi -One thousand five hundred thirty one c c~1531 cc -Three hundred micrometers~300 μm -Sixty five thousand square kilometers~65000 km² -Two miles per hour~2 mph -Two hundred forty five miles per hour~245 mph -One hundred fifty c c~150 cc -Sixty point two four zero zero kilograms~60.2400 kg -Zero feet~0 ft -Zero foot~0 ft -Two feet~2 ft -Twenty foot~20 ft -Point two meters~.2 m -Two square meters~2 m² -Eighteen feet~18 ft -Eighteen mega siemens~18 ms -Eighteen ounces~18 
oz -Eighteen point five kilometers~18.5 km -Eighteen point five two square kilometers~18.52 km² -Eighteen point nine one square kilometers~18.91 km² -Eighteen point one four percent~18.14 % -Eighteen point one six percent~18.16 % -Eighteen point one square kilometers~18.1 km² -Eighteen point six percent~18.6 % -Eighteen point two two kilometers~18.22 km -Eighteen point zero kilometers~18.0 km -Eighteen point zero percent~18.0 % -Eighteen square kilometers~18 km² -Eighteen thousand eight hundred giga watt hours~18800 gWh -Eighteen thousand seven hundred hectares~18700 ha -Eight hectares~8 ha -Eight hundred eighty five astronomical units~885 au -Eight hundred eighty hectares~880 ha -Eight hundred eighty kilobytes~880 kb -Eight hundred eighty kilometers~880 km -Eight hundred eighty nine feet~889 ft -Eight hundred eighty six kilometers~886 km -Eight hundred eighty two megawatts~882 mW -Eight hundred feet~800 ft -Eight hundred fifty five square kilometers~855 km² -Eight hundred fifty megahertz~850 mhz -Eight hundred fifty meters~850 m -Eight hundred fifty nanometers~850 nm -Eight hundred fifty one meters~851 m -Eight hundred fifty seven square kilometers~857 km² -Eight hundred fifty three meters~853 m -Eight hundred fifty three point six meters~853.6 m -Eight hundred five point four six square kilometers~805.46 km² -Eight hundred forty two point nine meters~842.9 m -Eight hundred forty two square kilometers~842 km² -Eight hundred gigabytes~800 gb -Eight hundred horsepower~800 hp -Eight hundred kilograms~800 kg -Eight hundred kilo watt hours~800 kWh -Eight hundred kilowatts~800 kW -Eight hundred megahertz~800 mhz -Eight hundred ninety four c c~894 cc -Eight hundred ninety kilowatts~890 kW -Eight hundred ninety millimeters~890 mm -Eight hundred ninety two square kilometers~892 km² -Eight hundred seventy horsepower~870 hp -Eight hundred seventy meters~870 m -Eight hundred sixty kilograms~860 kg -Eight hundred sixty kilometers~860 km -Eight hundred sixty miles~860 mi -Eight 
hundred sixty six feet~866 ft -Eight hundred ten hectares~810 ha -Eight hundred ten kilohertz~810 khz -Eight hundred thirty eight point two millimeters~838.2 mm -Eight hundred thirty five kilometers~835 km -Eight hundred thirty kilohertz~830 khz -Eight hundred thirty megawatts~830 mW -Eight hundred thirty nine kilometers~839 km -Eight hundred thirty six meters~836 m -Eight hundred twenty feet~820 ft -Eight hundred twenty kilometers~820 km -Eight hundred twenty meters~820 m -Eight hundred twenty one point zero feet~821.0 ft -Eight hundred two point eight nine kilometers~802.89 km -Eight hundred volts~800 v -Eight kilobits~8 kb -Eight KiloBits~8 kb -Eight kilograms~8 kg -Eight million two hundred thousand feet~8200000 ft -Eight point eight kilometers~8.8 km -Eight point eight meters~8.8 m -Eight point eight miles~8.8 mi -Eight point five centimeters~8.5 cm -Eight point five five percent~8.55 % -Eight point five megawatts~8.5 mW -Eight point five meters~8.5 m -Eight point five two percent~8.52 % -Eight point four four percent~8.44 % -one gigabit per second~1 gbps -nine gigabits per second~9 gbps -five degrees celsius~5 °C -seventy two degrees fahrenheit~72 °F -Seventy two Degrees Fahrenheit~72 °F -two hundred seventy three kelvin~273 K -Nine GigaBits per second~9 gbps +two hundred meters => 200 m +fifty six point three per square kilometer => 56.3 /km² +two hundred kilometers per hour => 200 km/h +twenty eight kilograms force per square centimeter => 28 kgf/cm² +forty two thousand two hundred fifty nine per square meter => 42259 /m² +minus two thousand twelve kilo liters => -2012 kl +minus sixty six kilograms => -66 kg +two kilo watt hours => 2 kWh +one point o o o o two eight cubic deci meters => 1.000028 dm³ +seven point five peta bytes => 7.5 pb +three hours => 3 h +one milli volt => 1 mv +two cubic meters => 2 m³ +ninety grams => 90 g +one hundred twenty four point three lumens => 124.3 lm +four hundred forty milliliters => 440 ml +thirty one thousand four hundred 
eighty square feet => 31480 sq ft +one thousand six hundred hours => 1600 h +thirty one thousand four hundred eighty square feet => 31480 sq ft +two square miles => 2 sq mi +zero point one nine square miles => 0.19 sq mi +one thousand five hundred thirty one c c => 1531 cc +three hundred micrometers => 300 μm +sixty five thousand square kilometers => 65000 km² +two miles per hour => 2 mph +two hundred forty five miles per hour => 245 mph +one hundred fifty c c => 150 cc +sixty point two four zero zero kilograms => 60.2400 kg +zero feet => 0 ft +zero foot => 0 ft +two feet => 2 ft +twenty foot => 20 ft +point two meters => .2 m +two square meters => 2 m² +eighteen feet => 18 ft +eighteen mega siemens => 18 ms +eighteen ounces => 18 oz +eighteen point five kilometers => 18.5 km +eighteen point five two square kilometers => 18.52 km² +eighteen point nine one square kilometers => 18.91 km² +eighteen point one four percent => 18.14 % +eighteen point one six percent => 18.16 % +eighteen point one square kilometers => 18.1 km² +eighteen point six percent => 18.6 % +eighteen point two two kilometers => 18.22 km +eighteen point zero kilometers => 18.0 km +eighteen point zero percent => 18.0 % +eighteen square kilometers => 18 km² +eighteen thousand eight hundred giga watt hours => 18800 gWh +eighteen thousand seven hundred hectares => 18700 ha +eight hectares => 8 ha +eight hundred eighty five astronomical units => 885 au +eight hundred eighty hectares => 880 ha +eight hundred eighty kilobytes => 880 kb +eight hundred eighty kilometers => 880 km +eight hundred eighty nine feet => 889 ft +eight hundred eighty six kilometers => 886 km +eight hundred eighty two megawatts => 882 mW +eight hundred feet => 800 ft +eight hundred fifty five square kilometers => 855 km² +eight hundred fifty megahertz => 850 mhz +eight hundred fifty meters => 850 m +eight hundred fifty nanometers => 850 nm +eight hundred fifty one meters => 851 m +eight hundred fifty seven square kilometers => 857 
km² +eight hundred fifty three meters => 853 m +eight hundred fifty three point six meters => 853.6 m +eight hundred five point four six square kilometers => 805.46 km² +eight hundred forty two point nine meters => 842.9 m +eight hundred forty two square kilometers => 842 km² +eight hundred gigabytes => 800 gb +eight hundred horsepower => 800 hp +eight hundred kilograms => 800 kg +eight hundred kilo watt hours => 800 kWh +eight hundred kilowatts => 800 kW +eight hundred megahertz => 800 mhz +eight hundred ninety four c c => 894 cc +eight hundred ninety kilowatts => 890 kW +eight hundred ninety millimeters => 890 mm +eight hundred ninety two square kilometers => 892 km² +eight hundred seventy horsepower => 870 hp +eight hundred seventy meters => 870 m +eight hundred sixty kilograms => 860 kg +eight hundred sixty kilometers => 860 km +eight hundred sixty miles => 860 mi +eight hundred sixty six feet => 866 ft +eight hundred ten hectares => 810 ha +eight hundred ten kilohertz => 810 khz +eight hundred thirty eight point two millimeters => 838.2 mm +eight hundred thirty five kilometers => 835 km +eight hundred thirty kilohertz => 830 khz +eight hundred thirty megawatts => 830 mW +eight hundred thirty nine kilometers => 839 km +eight hundred thirty six meters => 836 m +eight hundred twenty feet => 820 ft +eight hundred twenty kilometers => 820 km +eight hundred twenty meters => 820 m +eight hundred twenty one point zero feet => 821.0 ft +eight hundred two point eight nine kilometers => 802.89 km +eight hundred volts => 800 v +eight kilobits => 8 kb +eight kilograms => 8 kg +eight million two hundred thousand feet => 8200000 ft +eight point eight kilometers => 8.8 km +eight point eight meters => 8.8 m +eight point eight miles => 8.8 mi +eight point five centimeters => 8.5 cm +eight point five five percent => 8.55 % +eight point five megawatts => 8.5 mW +eight point five meters => 8.5 m +eight point five two percent => 8.52 % +eight point four four percent => 8.44 % +Two 
hundred meters => 200 m +Fifty six point three per square kilometer => 56.3 /km² +Two hundred kilometers per hour => 200 km/h +Twenty eight kilograms force per square centimeter => 28 kgf/cm² +Forty two thousand two hundred fifty nine per square meter => 42259 /m² +Minus two thousand twelve kilo liters => -2012 kl +Minus sixty six kilograms => -66 kg +Two kilo watt hours => 2 kWh +One point o o o o two eight cubic deci meters => 1.000028 dm³ +Seven point five peta bytes => 7.5 pb +Three hours => 3 h +One milli volt => 1 mv +Two cubic meters => 2 m³ +Ninety grams => 90 g +One hundred twenty four point three lumens => 124.3 lm +Four hundred forty milliliters => 440 ml +Thirty one thousand four hundred eighty square feet => 31480 sq ft +One thousand six hundred hours => 1600 h +Thirty one thousand four hundred eighty square feet => 31480 sq ft +Two square miles => 2 sq mi +Zero point one nine square miles => 0.19 sq mi +One thousand five hundred thirty one c c => 1531 cc +Three hundred micrometers => 300 μm +Sixty five thousand square kilometers => 65000 km² +Two miles per hour => 2 mph +Two hundred forty five miles per hour => 245 mph +One hundred fifty c c => 150 cc +Sixty point two four zero zero kilograms => 60.2400 kg +Zero feet => 0 ft +Zero foot => 0 ft +Two feet => 2 ft +Twenty foot => 20 ft +Point two meters => .2 m +Two square meters => 2 m² +Eighteen feet => 18 ft +Eighteen mega siemens => 18 ms +Eighteen ounces => 18 oz +Eighteen point five kilometers => 18.5 km +Eighteen point five two square kilometers => 18.52 km² +Eighteen point nine one square kilometers => 18.91 km² +Eighteen point one four percent => 18.14 % +Eighteen point one six percent => 18.16 % +Eighteen point one square kilometers => 18.1 km² +Eighteen point six percent => 18.6 % +Eighteen point two two kilometers => 18.22 km +Eighteen point zero kilometers => 18.0 km +Eighteen point zero percent => 18.0 % +Eighteen square kilometers => 18 km² +Eighteen thousand eight hundred giga watt hours 
=> 18800 gWh +Eighteen thousand seven hundred hectares => 18700 ha +Eight hectares => 8 ha +Eight hundred eighty five astronomical units => 885 au +Eight hundred eighty hectares => 880 ha +Eight hundred eighty kilobytes => 880 kb +Eight hundred eighty kilometers => 880 km +Eight hundred eighty nine feet => 889 ft +Eight hundred eighty six kilometers => 886 km +Eight hundred eighty two megawatts => 882 mW +Eight hundred feet => 800 ft +Eight hundred fifty five square kilometers => 855 km² +Eight hundred fifty megahertz => 850 mhz +Eight hundred fifty meters => 850 m +Eight hundred fifty nanometers => 850 nm +Eight hundred fifty one meters => 851 m +Eight hundred fifty seven square kilometers => 857 km² +Eight hundred fifty three meters => 853 m +Eight hundred fifty three point six meters => 853.6 m +Eight hundred five point four six square kilometers => 805.46 km² +Eight hundred forty two point nine meters => 842.9 m +Eight hundred forty two square kilometers => 842 km² +Eight hundred gigabytes => 800 gb +Eight hundred horsepower => 800 hp +Eight hundred kilograms => 800 kg +Eight hundred kilo watt hours => 800 kWh +Eight hundred kilowatts => 800 kW +Eight hundred megahertz => 800 mhz +Eight hundred ninety four c c => 894 cc +Eight hundred ninety kilowatts => 890 kW +Eight hundred ninety millimeters => 890 mm +Eight hundred ninety two square kilometers => 892 km² +Eight hundred seventy horsepower => 870 hp +Eight hundred seventy meters => 870 m +Eight hundred sixty kilograms => 860 kg +Eight hundred sixty kilometers => 860 km +Eight hundred sixty miles => 860 mi +Eight hundred sixty six feet => 866 ft +Eight hundred ten hectares => 810 ha +Eight hundred ten kilohertz => 810 khz +Eight hundred thirty eight point two millimeters => 838.2 mm +Eight hundred thirty five kilometers => 835 km +Eight hundred thirty kilohertz => 830 khz +Eight hundred thirty megawatts => 830 mW +Eight hundred thirty nine kilometers => 839 km +Eight hundred thirty six meters => 836 m +Eight 
hundred twenty feet => 820 ft +Eight hundred twenty kilometers => 820 km +Eight hundred twenty meters => 820 m +Eight hundred twenty one point zero feet => 821.0 ft +Eight hundred two point eight nine kilometers => 802.89 km +Eight hundred volts => 800 v +Eight kilobits => 8 kb +Eight KiloBits => 8 kb +Eight kilograms => 8 kg +Eight million two hundred thousand feet => 8200000 ft +Eight point eight kilometers => 8.8 km +Eight point eight meters => 8.8 m +Eight point eight miles => 8.8 mi +Eight point five centimeters => 8.5 cm +Eight point five five percent => 8.55 % +Eight point five megawatts => 8.5 mW +Eight point five meters => 8.5 m +Eight point five two percent => 8.52 % +Eight point four four percent => 8.44 % +one gigabit per second => 1 gbps +nine gigabits per second => 9 gbps +five degrees celsius => 5 °C +seventy two degrees fahrenheit => 72 °F +Seventy two Degrees Fahrenheit => 72 °F +two hundred seventy three kelvin => 273 K +Nine GigaBits per second => 9 gbps diff --git a/itn/english/test/data/money.txt b/itn/english/test/data/money.txt index 9a40405a..9729c148 100644 --- a/itn/english/test/data/money.txt +++ b/itn/english/test/data/money.txt @@ -1,54 +1,54 @@ -Two dollars~$2 -One cent~$0.01 -Four united states dollars and sixty nine cents~$4.69 -Seventy five dollars sixty three~$75.63 -Twenty nine dollars fifty cents~$29.50 -Eleven dollars and fifty one cents~$11.51 -Nine hundred ninety three dollars and ninety two cents~$993.92 -Four hundred sixty billion won~₩460 billion -Thirty billion yen~¥30 billion -Two point five billion dollars~$2.5 billion -Forty five billion dollars~$45 billion -Fifty million dollars~$50 million -Fifty billion dollars~$50 billion -Zero point two million dollars~$0.2 million -Fifteen point two billion dollars~$15.2 billion -One point six nine billion yuan~1.69 billion yuan -One point four three six billion yuan~1.436 billion yuan -Four million yuan~4 million yuan -One dollar~$1 -Fifteen thousand dollars~$15000 -One 
dollars~One dollars -Twenty dollar~$20 -Twenty point five o six dollars~$20.506 -Point five o six dollars~$.506 -Eighteen dollars~$18 -Eighteen million nine hundred twenty five thousand dollars~$18925000 -Eighteen thousand eight hundred fifty four dollars~$18854 -Eighteen thousand eight hundred one dollars~$18801 -Eighteen thousand eight hundred seventy five dollars~$18875 -Eighteen thousand eighty one dollars~$18081 -Eighteen thousand fifty two dollars~$18052 -Eighteen thousand five hundred forty two dollars~$18542 -Eighteen thousand five hundred nineteen dollars~$18519 -Eighteen thousand five hundred seventy dollars~$18570 -Eighteen thousand five hundred seventy eight dollars~$18578 -Eighteen thousand five hundred sixteen dollars~$18516 -Eighteen thousand four hundred eighty two dollars~$18482 -Eighteen thousand four hundred seventy eight dollars~$18478 -Eighteen thousand four hundred sixty eight dollars~$18468 -Eighteen thousand nine hundred three dollars~$18903 -Eighteen thousand nine hundred twenty nine dollars~$18929 -Eighteen thousand ninety five dollars~$18095 -Eighteen thousand one hundred seventeen dollars~$18117 -Eighteen thousand one hundred twenty eight dollars~$18128 -Eighteen thousand one hundred twenty five dollars~$18125 -Eighteen thousand one hundred twenty four dollars~$18124 -Eighteen thousand one hundred twenty nine dollars~$18129 -One thousand fifty five dollars~$1055 -One fifty five dollars~$155 -Fifteen hundred dollars~$1500 -Ninety nine hundred dollars~$9900 -Ninety nine hundred and fifteen dollars and one cent~$9915.01 -zero point two Million Dollars~$0.2 Million -One Cent~$0.01 +Two dollars => $2 +One cent => $0.01 +Four united states dollars and sixty nine cents => $4.69 +Seventy five dollars sixty three => $75.63 +Twenty nine dollars fifty cents => $29.50 +Eleven dollars and fifty one cents => $11.51 +Nine hundred ninety three dollars and ninety two cents => $993.92 +Four hundred sixty billion won => ₩460 billion +Thirty billion yen => 
¥30 billion +Two point five billion dollars => $2.5 billion +Forty five billion dollars => $45 billion +Fifty million dollars => $50 million +Fifty billion dollars => $50 billion +Zero point two million dollars => $0.2 million +Fifteen point two billion dollars => $15.2 billion +One point six nine billion yuan => 1.69 billion yuan +One point four three six billion yuan => 1.436 billion yuan +Four million yuan => 4 million yuan +One dollar => $1 +Fifteen thousand dollars => $15000 +One dollars => One dollars +Twenty dollar => $20 +Twenty point five o six dollars => $20.506 +Point five o six dollars => $.506 +Eighteen dollars => $18 +Eighteen million nine hundred twenty five thousand dollars => $18925000 +Eighteen thousand eight hundred fifty four dollars => $18854 +Eighteen thousand eight hundred one dollars => $18801 +Eighteen thousand eight hundred seventy five dollars => $18875 +Eighteen thousand eighty one dollars => $18081 +Eighteen thousand fifty two dollars => $18052 +Eighteen thousand five hundred forty two dollars => $18542 +Eighteen thousand five hundred nineteen dollars => $18519 +Eighteen thousand five hundred seventy dollars => $18570 +Eighteen thousand five hundred seventy eight dollars => $18578 +Eighteen thousand five hundred sixteen dollars => $18516 +Eighteen thousand four hundred eighty two dollars => $18482 +Eighteen thousand four hundred seventy eight dollars => $18478 +Eighteen thousand four hundred sixty eight dollars => $18468 +Eighteen thousand nine hundred three dollars => $18903 +Eighteen thousand nine hundred twenty nine dollars => $18929 +Eighteen thousand ninety five dollars => $18095 +Eighteen thousand one hundred seventeen dollars => $18117 +Eighteen thousand one hundred twenty eight dollars => $18128 +Eighteen thousand one hundred twenty five dollars => $18125 +Eighteen thousand one hundred twenty four dollars => $18124 +Eighteen thousand one hundred twenty nine dollars => $18129 +One thousand fifty five dollars => $1055 +One fifty 
five dollars => $155 +Fifteen hundred dollars => $1500 +Ninety nine hundred dollars => $9900 +Ninety nine hundred and fifteen dollars and one cent => $9915.01 +zero point two Million Dollars => $0.2 Million +One Cent => $0.01 diff --git a/itn/english/test/data/ordinal.txt b/itn/english/test/data/ordinal.txt index 28b5af81..04a867fe 100644 --- a/itn/english/test/data/ordinal.txt +++ b/itn/english/test/data/ordinal.txt @@ -1,34 +1,34 @@ -One hundredth~100th -Twenty five thousand one hundred eleventh~25111th -Second~2nd -Zeroth~0th -First~1st -Second~2nd -Third~3rd -Fourth~4th -Eleventh~11th -Twelfth~12th -Thirteenth~13th -Twenty first~21st -Twenty third~23rd -One hundred eleventh~111th -One thousandth~1000th -One hundred twenty first~121st -Eleven hundred twenty first~1121st -Second~2nd -Tenth~10th -Sixth~6th -Third~3rd -Nineteenth~19th -Third~3rd -Twelfth~12th -Forty eighth~48th -Seventy first~71st -Third~3rd -Forty second~42nd -Seventeenth~17th -Twentieth~20th -Twenty first~21st -Seventh~7th -Second~2nd -Fifth~5th +One hundredth => 100th +Twenty five thousand one hundred eleventh => 25111th +Second => 2nd +Zeroth => 0th +First => 1st +Second => 2nd +Third => 3rd +Fourth => 4th +Eleventh => 11th +Twelfth => 12th +Thirteenth => 13th +Twenty first => 21st +Twenty third => 23rd +One hundred eleventh => 111th +One thousandth => 1000th +One hundred twenty first => 121st +Eleven hundred twenty first => 1121st +Second => 2nd +Tenth => 10th +Sixth => 6th +Third => 3rd +Nineteenth => 19th +Third => 3rd +Twelfth => 12th +Forty eighth => 48th +Seventy first => 71st +Third => 3rd +Forty second => 42nd +Seventeenth => 17th +Twentieth => 20th +Twenty first => 21st +Seventh => 7th +Second => 2nd +Fifth => 5th diff --git a/itn/english/test/data/telephone.txt b/itn/english/test/data/telephone.txt index cfb2e73d..c2b03d01 100644 --- a/itn/english/test/data/telephone.txt +++ b/itn/english/test/data/telephone.txt @@ -1,18 +1,18 @@ -One two three one two three five six seven 
eight~123-123-5678 -Plus nine one one two three one two three five six seven eight~+91 123-123-5678 -Plus forty four one two three one two three five six seven eight~+44 123-123-5678 -Four one two three one two three five six seven eight~4 123-123-5678 -Zero two three one two three five six seven eight~023-123-5678 -O two three one two three five six seven eight~023-123-5678 -Oh two three one two three five six seven eight~023-123-5678 -Double oh three one two three five six seven eight~003-123-5678 -Four three two double seven three two one four three two one four three double zero five~432 7732 143214 3005 -One two three dot one two three dot o dot four o~123.123.0.40 -One twenty three dot one two three dot o dot four o~123.123.0.40 -Two two five dot double five dot o dot four o~225.55.0.40 -Two two five dot double five dot o dot forty five~225.55.0.45 -SSN is seven double nine one two three double one three~SSN is 799-12-3113 -Seven nine nine~799 -A b nine~Ab9 -A b c~A b c -Five w k r a three one~5wkra31 +One two three one two three five six seven eight => 123-123-5678 +Plus nine one one two three one two three five six seven eight => +91 123-123-5678 +Plus forty four one two three one two three five six seven eight => +44 123-123-5678 +Four one two three one two three five six seven eight => 4 123-123-5678 +Zero two three one two three five six seven eight => 023-123-5678 +O two three one two three five six seven eight => 023-123-5678 +Oh two three one two three five six seven eight => 023-123-5678 +Double oh three one two three five six seven eight => 003-123-5678 +Four three two double seven three two one four three two one four three double zero five => 432 7732 143214 3005 +One two three dot one two three dot o dot four o => 123.123.0.40 +One twenty three dot one two three dot o dot four o => 123.123.0.40 +Two two five dot double five dot o dot four o => 225.55.0.40 +Two two five dot double five dot o dot forty five => 225.55.0.45 +SSN is seven double nine 
one two three double one three => SSN is 799-12-3113 +Seven nine nine => 799 +A b nine => Ab9 +A b c => A b c +Five w k r a three one => 5wkra31 diff --git a/itn/english/test/data/time.txt b/itn/english/test/data/time.txt index ac76b36b..86603aca 100644 --- a/itn/english/test/data/time.txt +++ b/itn/english/test/data/time.txt @@ -1,33 +1,33 @@ -Eight oclock g m t~08:00 gmt -Seven a m e s t~07:00 a.m. est -Two p m~02:00 p.m. -Seven A M E S T~07:00 A.M. EST -Two P M~02:00 P.M. -Two thirty~02:30 -Three o'clock~03:00 -Quarter past one~01:15 -Half past three~03:30 -Eight fifty one~08:51 -Eight fifty two~08:52 -Eight forty~08:40 -Eight nineteen~08:19 -Eight o six~08:06 -Eight thirty eight~08:38 -Eight thirty two~08:32 -Eight twenty nine~08:29 -Eleven fifty five p m~11:55 p.m. -Eleven fifty three p m~11:53 p.m. -Eleven forty a m~11:40 a.m. -Eleven forty five a m~11:45 a.m. -Eleven forty p m~11:40 p.m. -Eleven forty six a m~11:46 a.m. -Eleven o six p m~11:06 p.m. -Eleven thirteen a m~11:13 a.m. -Half past twelve~12:30 -Quarter past one~01:15 -Quarter to one~12:45 -Quarter to twelve~11:45 -Set alarm at ten to eleven pm~Set alarm at 10:50 p.m. -One min to one am~12:59 a.m. -eleven Forty six A M~11:46 A.M. -eleven forty six AM~11:46 A.M. +Eight oclock g m t => 08:00 gmt +Seven a m e s t => 07:00 a.m. est +Two p m => 02:00 p.m. +Seven A M E S T => 07:00 A.M. EST +Two P M => 02:00 P.M. +Two thirty => 02:30 +Three o'clock => 03:00 +Quarter past one => 01:15 +Half past three => 03:30 +Eight fifty one => 08:51 +Eight fifty two => 08:52 +Eight forty => 08:40 +Eight nineteen => 08:19 +Eight o six => 08:06 +Eight thirty eight => 08:38 +Eight thirty two => 08:32 +Eight twenty nine => 08:29 +Eleven fifty five p m => 11:55 p.m. +Eleven fifty three p m => 11:53 p.m. +Eleven forty a m => 11:40 a.m. +Eleven forty five a m => 11:45 a.m. +Eleven forty p m => 11:40 p.m. +Eleven forty six a m => 11:46 a.m. +Eleven o six p m => 11:06 p.m. +Eleven thirteen a m => 11:13 a.m. 
+Half past twelve => 12:30 +Quarter past one => 01:15 +Quarter to one => 12:45 +Quarter to twelve => 11:45 +Set alarm at ten to eleven pm => Set alarm at 10:50 p.m. +One min to one am => 12:59 a.m. +eleven Forty six A M => 11:46 A.M. +eleven forty six AM => 11:46 A.M. diff --git a/itn/english/test/data/whitelist.txt b/itn/english/test/data/whitelist.txt index 684c321d..7415241c 100644 --- a/itn/english/test/data/whitelist.txt +++ b/itn/english/test/data/whitelist.txt @@ -1,9 +1,9 @@ -Doctor dao~Dr. dao -Misses Smith~Mrs. Smith -Misses smith~Mrs. smith -Mister Dao~Mr. Dao -Saint George~St. George -I like for example ice cream~I like e.g. ice cream -S and p five hundred~S&P 500 -S and P Five hundred~S&P 500 -Seven eleven stores~7-eleven stores +Doctor dao => Dr. dao +Misses Smith => Mrs. Smith +Misses smith => Mrs. smith +Mister Dao => Mr. Dao +Saint George => St. George +I like for example ice cream => I like e.g. ice cream +S and p five hundred => S&P 500 +S and P Five hundred => S&P 500 +Seven eleven stores => 7-eleven stores diff --git a/itn/english/test/data/word.txt b/itn/english/test/data/word.txt index 3d02d716..3a8d9657 100644 --- a/itn/english/test/data/word.txt +++ b/itn/english/test/data/word.txt @@ -1,49 +1,49 @@ -E s three~Es3 -Yahoo!~Yahoo! -Twenty!~20 ! 
-X ~X -—~— -Aaa~Aaa -Aabach~Aabach -Aabenraa~Aabenraa -Aabye~Aabye -Aaccessed~Aaccessed -Aach~Aach -Aachen's~Aachen's -Aadri~Aadri -Aafia~Aafia -Aagaard~Aagaard -Aagadu~Aagadu -Aagard~Aagard -Aagathadi~Aagathadi -Aaghart's~Aaghart's -Aagnes~Aagnes -Aagomoni~Aagomoni -Aagon~Aagon -Aagoo~Aagoo -Aagot~Aagot -Aahar~Aahar -Aahh~Aahh -Aahperd~Aahperd -Aaibinterstate~Aaibinterstate -Aajab~Aajab -Aakasa~Aakasa -Aakervik~Aakervik -Aakirkeby~Aakirkeby -Aalam~Aalam -Aalbaek~Aalbaek -Aaldiu~Aaldiu -Aalem~Aalem -A'ali~A'ali -Aalilaassamthey~Aalilaassamthey -Aalin~Aalin -Aaliyan~Aaliyan -Aaliyan's~Aaliyan's -Aamadu~Aamadu -Aamara~Aamara -Aambala~Aambala -Aamera~Aamera -Aamer's~Aamer's -Aamina~Aamina -Aaminah~Aaminah -Aamjiwnaang~Aamjiwnaang +E s three => Es3 +Yahoo! => Yahoo! +Twenty! => 20 ! +X => X +— => — +Aaa => Aaa +Aabach => Aabach +Aabenraa => Aabenraa +Aabye => Aabye +Aaccessed => Aaccessed +Aach => Aach +Aachen's => Aachen's +Aadri => Aadri +Aafia => Aafia +Aagaard => Aagaard +Aagadu => Aagadu +Aagard => Aagard +Aagathadi => Aagathadi +Aaghart's => Aaghart's +Aagnes => Aagnes +Aagomoni => Aagomoni +Aagon => Aagon +Aagoo => Aagoo +Aagot => Aagot +Aahar => Aahar +Aahh => Aahh +Aahperd => Aahperd +Aaibinterstate => Aaibinterstate +Aajab => Aajab +Aakasa => Aakasa +Aakervik => Aakervik +Aakirkeby => Aakirkeby +Aalam => Aalam +Aalbaek => Aalbaek +Aaldiu => Aaldiu +Aalem => Aalem +A'ali => A'ali +Aalilaassamthey => Aalilaassamthey +Aalin => Aalin +Aaliyan => Aaliyan +Aaliyan's => Aaliyan's +Aamadu => Aamadu +Aamara => Aamara +Aambala => Aambala +Aamera => Aamera +Aamer's => Aamer's +Aamina => Aamina +Aaminah => Aaminah +Aamjiwnaang => Aamjiwnaang diff --git a/itn/english/test/normalizer_test.py b/itn/english/test/normalizer_test.py new file mode 100644 index 00000000..38b787e9 --- /dev/null +++ b/itn/english/test/normalizer_test.py @@ -0,0 +1,36 @@ +# Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) +# +# Licensed under the Apache License, Version 2.0 (the "License"); 
+# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from itertools import chain + +import pytest + +from itn.english.inverse_normalizer import InverseNormalizer +from itn.english.test.utils import parse_test_case + + +class TestNormalizer: + + normalizer = InverseNormalizer(overwrite_cache=True) + + normalizer_cases = chain( + parse_test_case("data/cardinal.txt"), + # parse_test_case("data/normalizer.txt"), + ) + + @pytest.mark.parametrize("spoken, written", normalizer_cases) + def test_normalizer(self, spoken, written): + print(spoken, written) + print(self.normalizer.tag(spoken)) + assert self.normalizer.normalize(spoken) == written diff --git a/itn/english/test/utils.py b/itn/english/test/utils.py new file mode 120000 index 00000000..a2ade7bc --- /dev/null +++ b/itn/english/test/utils.py @@ -0,0 +1 @@ +../../../tn/chinese/test/utils.py \ No newline at end of file From ad82d030504a9253168f490938680f95186ea755 Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Mon, 1 Dec 2025 14:04:27 +0800 Subject: [PATCH 3/4] add ordinal --- itn/english/inverse_normalizer.py | 7 ++- itn/english/rules/ordinal.py | 68 +++++++++++++++++++++++++++++ itn/english/test/normalizer_test.py | 3 +- 3 files changed, 75 insertions(+), 3 deletions(-) create mode 100644 itn/english/rules/ordinal.py diff --git a/itn/english/inverse_normalizer.py b/itn/english/inverse_normalizer.py index 126cddbc..ed3ee4c2 100644 --- a/itn/english/inverse_normalizer.py +++ b/itn/english/inverse_normalizer.py @@ -16,6 +16,7 @@ from pynini.lib.pynutil import add_weight, 
delete from itn.english.rules.cardinal import Cardinal +from itn.english.rules.ordinal import Ordinal from itn.english.rules.word import Word from tn.processor import Processor @@ -29,12 +30,14 @@ def __init__(self, cache_dir=None, overwrite_cache=False): self.build_fst("en_itn", cache_dir, overwrite_cache) def build_tagger(self): - tagger = (add_weight(Cardinal().tagger, 1.0) | add_weight(Word().tagger, 100)).optimize() + tagger = ( + add_weight(Cardinal().tagger, 1.0) | add_weight(Ordinal().tagger, 1.09) | add_weight(Word().tagger, 100) + ).optimize() tagger = tagger.star # remove the last space self.tagger = tagger @ self.build_rule(delete(" "), "", "[EOS]") def build_verbalizer(self): - verbalizer = (Cardinal().verbalizer | Word().verbalizer).optimize() + verbalizer = (Cardinal().verbalizer | Ordinal().verbalizer | Word().verbalizer).optimize() self.verbalizer = verbalizer.star diff --git a/itn/english/rules/ordinal.py b/itn/english/rules/ordinal.py new file mode 100644 index 00000000..537a586a --- /dev/null +++ b/itn/english/rules/ordinal.py @@ -0,0 +1,68 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from pynini import closure, cross, string_file, union +from pynini.lib.pynutil import insert + +from itn.english.rules.cardinal import Cardinal +from tn.processor import Processor +from tn.utils import get_abs_path + + +class Ordinal(Processor): + """ + Finite state transducer for classifying ordinal + e.g. thirteenth -> ordinal { integer: "13" } + + Args: + cardinal: CardinalFst + input_case: accepting either "lower_cased" or "cased" input. + """ + + def __init__(self): + super().__init__("ordinal") + self.build_tagger() + self.build_verbalizer() + + def build_tagger(self): + graph_digit = string_file(get_abs_path("../itn/english/data/ordinals/digit.tsv")) + graph_teens = string_file(get_abs_path("../itn/english/data/ordinals/teen.tsv")) + graph = closure(self.CHAR) + union(graph_digit, graph_teens, cross("tieth", "ty"), cross("th", "")) + + self.graph = graph @ Cardinal().graph_no_exception + self.graph |= ((self.TO_LOWER + self.SIGMA) @ self.graph).optimize() + + convert_eleven = cross("11", "11th") + convert_twelve = cross("12", "12th") + convert_thirteen = cross("13", "13th") + convert_one = cross("1", "1st") + convert_two = cross("2", "2nd") + convert_three = cross("3", "3rd") + convert_rest = insert("th", weight=0.01) + suffix = self.build_rule( + convert_eleven + | convert_twelve + | convert_thirteen + | convert_one + | convert_two + | convert_three + | convert_rest, + "", + "[EOS]", + ) + self.graph = self.graph @ suffix + + tagger = insert('value: "') + self.graph + insert('"') + self.tagger = self.add_tokens(tagger).optimize() diff --git a/itn/english/test/normalizer_test.py b/itn/english/test/normalizer_test.py index 38b787e9..01c7d6bf 100644 --- a/itn/english/test/normalizer_test.py +++ b/itn/english/test/normalizer_test.py @@ -26,7 +26,8 @@ class TestNormalizer: normalizer_cases = chain( parse_test_case("data/cardinal.txt"), - # parse_test_case("data/normalizer.txt"), + parse_test_case("data/ordinal.txt"), + # parse_test_case("data/word.txt"), ) 
@pytest.mark.parametrize("spoken, written", normalizer_cases) From 5d25a96964a18f21bd5fd275235b76f7d3104ab7 Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Mon, 1 Dec 2025 15:29:52 +0800 Subject: [PATCH 4/4] add date --- itn/english/data/months_cased.tsv | 12 -- itn/english/inverse_normalizer.py | 10 +- itn/english/rules/date.py | 183 ++++++++++++++++++++++++++++ itn/english/rules/ordinal.py | 12 +- itn/english/rules/word.py | 1 - itn/english/test/data/debug.txt | 10 ++ itn/english/test/normalizer_test.py | 9 +- tn/token_parser.py | 5 + 8 files changed, 218 insertions(+), 24 deletions(-) delete mode 100644 itn/english/data/months_cased.tsv create mode 100644 itn/english/rules/date.py create mode 100644 itn/english/test/data/debug.txt diff --git a/itn/english/data/months_cased.tsv b/itn/english/data/months_cased.tsv deleted file mode 100644 index 750d7937..00000000 --- a/itn/english/data/months_cased.tsv +++ /dev/null @@ -1,12 +0,0 @@ -January -February -March -April -May -June -July -August -September -October -November -December diff --git a/itn/english/inverse_normalizer.py b/itn/english/inverse_normalizer.py index ed3ee4c2..cf27ced4 100644 --- a/itn/english/inverse_normalizer.py +++ b/itn/english/inverse_normalizer.py @@ -16,6 +16,7 @@ from pynini.lib.pynutil import add_weight, delete from itn.english.rules.cardinal import Cardinal +from itn.english.rules.date import Date from itn.english.rules.ordinal import Ordinal from itn.english.rules.word import Word from tn.processor import Processor @@ -24,14 +25,17 @@ class InverseNormalizer(Processor): def __init__(self, cache_dir=None, overwrite_cache=False): - super().__init__(name="en_inverse_normalizer", ordertype="itn") + super().__init__(name="en_inverse_normalizer", ordertype="en_itn") if cache_dir is None: cache_dir = files("itn") self.build_fst("en_itn", cache_dir, overwrite_cache) def build_tagger(self): tagger = ( - add_weight(Cardinal().tagger, 1.0) | add_weight(Ordinal().tagger, 1.09) 
| add_weight(Word().tagger, 100) + add_weight(Cardinal().tagger, 1.0) + | add_weight(Ordinal().tagger, 1.09) + | add_weight(Date().tagger, 1.09) + | add_weight(Word().tagger, 100) ).optimize() tagger = tagger.star @@ -39,5 +43,5 @@ def build_tagger(self): self.tagger = tagger @ self.build_rule(delete(" "), "", "[EOS]") def build_verbalizer(self): - verbalizer = (Cardinal().verbalizer | Ordinal().verbalizer | Word().verbalizer).optimize() + verbalizer = (Cardinal().verbalizer | Date().verbalizer | Ordinal().verbalizer | Word().verbalizer).optimize() self.verbalizer = verbalizer.star diff --git a/itn/english/rules/date.py b/itn/english/rules/date.py new file mode 100644 index 00000000..f34f5e86 --- /dev/null +++ b/itn/english/rules/date.py @@ -0,0 +1,183 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pynini import accep, closure, cross, string_file, union +from pynini.lib.pynutil import add_weight, delete, insert + +from itn.english.rules.ordinal import Ordinal +from tn.processor import Processor +from tn.utils import get_abs_path + + +class Date(Processor): + """ + Finite state transducer for classifying date, + e.g. january fifth twenty twelve -> date { month: "january" day: "5" year: "2012" preserve_order: true } + e.g. the fifth of january twenty twelve -> date { day: "5" month: "january" year: "2012" preserve_order: true } + e.g. 
twenty twenty -> date { year: "2020" preserve_order: true } + + Args: + ordinal: OrdinalFst + input_case: accepting either "lower_cased" or "cased" input. + """ + + def __init__(self): + super().__init__("date") + self.build_tagger() + self.build_verbalizer() + + def build_tagger(self): + graph_teen = string_file(get_abs_path("../itn/english/data/numbers/teen.tsv")).optimize() + graph_digit = string_file(get_abs_path("../itn/english/data/numbers/digit.tsv")).optimize() + ties_graph = string_file(get_abs_path("../itn/english/data/numbers/ties.tsv")).optimize() + month_graph = string_file(get_abs_path("../itn/english/data/months.tsv")) + financial_period_graph = string_file(get_abs_path("../itn/english/data/date_period.tsv")).invert() + graph_ad_bc = self.DELETE_SPACE + string_file(get_abs_path("../itn/english/data/year_suffix.tsv")).invert() + + def _get_ties_graph(): + """ + Transducer for 20-99 e.g + twenty three -> 23 + """ + graph = ties_graph + (self.DELETE_SPACE + self.DIGIT | insert("0")) + graph |= ((self.TO_LOWER + self.SIGMA) @ graph).optimize() + return graph + + def _get_range_graph(): + """ + Transducer for decades (1**0s, 2**0s), centuries (2*00s, 1*00s), millennia (2000s) + """ + graph_ties = _get_ties_graph() + graph = (graph_ties | graph_teen) + self.DELETE_SPACE + cross("hundreds", "00s") + graph |= cross("two", "2") + self.DELETE_SPACE + cross("thousands", "000s") + graph |= ( + (graph_ties | graph_teen) + + self.DELETE_SPACE + + (closure(self.ALPHA, 1) + (cross("ies", "y") | delete("s"))) @ (graph_ties | cross("ten", "10")) + + insert("s") + ) + graph @= union("1", "2") + self.DIGIT + self.DIGIT + self.DIGIT + "s" + graph |= ((self.TO_LOWER + self.SIGMA) @ graph).optimize() + return graph + + def _get_year_graph(): + """ + Transducer for year, e.g. 
twenty twenty -> 2020 + """ + + def _get_digits_graph(): + zero = cross((accep("oh") | accep("o")), "0") + graph = zero + self.DELETE_SPACE + graph_digit + graph |= ((self.TO_LOWER + self.SIGMA) @ graph).optimize() + return graph + + def _get_thousands_graph(): + graph_ties = _get_ties_graph() + graph_hundred_component = (graph_digit + self.DELETE_SPACE + delete("hundred")) | insert("0") + optional_end = closure(delete("and "), 0, 1) + graph = ( + graph_digit + + self.DELETE_SPACE + + delete("thousand") + + self.DELETE_SPACE + + graph_hundred_component + + self.DELETE_SPACE + + (graph_teen | graph_ties | (optional_end + insert("0") + graph_digit)) + ) + graph |= ((self.TO_LOWER + self.SIGMA) @ graph).optimize() + return graph + + graph_ties = _get_ties_graph() + graph_digits = _get_digits_graph() + graph_thousands = _get_thousands_graph() + + year_graph = ( + # 20 19, 40 12, 2012 - assuming no limit on the year + (graph_teen + self.DELETE_SPACE + (graph_ties | graph_digits | graph_teen)) + | (graph_ties + self.DELETE_SPACE + (graph_ties | graph_digits | graph_teen)) + | graph_thousands + | ((graph_digit + self.DELETE_SPACE + (graph_ties | graph_digits | graph_teen)) + graph_ad_bc) + | ( + (graph_digit | graph_teen | graph_digits | graph_ties | graph_thousands) + + self.DELETE_SPACE + + graph_ad_bc + ) + | ( + (graph_ties + self.DELETE_SPACE + (graph_ties | graph_digits | graph_teen)) + + self.DELETE_SPACE + + graph_ad_bc + ) + | ( + ( + (graph_teen | graph_digit) + + self.DELETE_SPACE + + delete("hundred") + + insert("0") + + (graph_digit | insert("0")) + ) + + self.DELETE_SPACE + + graph_ad_bc + ) + ) + year_graph.optimize() + year_graph |= ((self.TO_LOWER + self.SIGMA) @ year_graph).optimize() + return year_graph + + year_graph = _get_year_graph() + YEAR_WEIGHT = 0.001 + year_graph = add_weight(year_graph, YEAR_WEIGHT) + month_graph = insert('month: "') + month_graph + insert('"') + + day_graph = insert('day: "') + add_weight(Ordinal().graph, -0.7) + insert('"') 
+ graph_year = self.DELETE_EXTRA_SPACE + insert('year: "') + add_weight(year_graph, -YEAR_WEIGHT) + insert('"') + optional_graph_year = closure(graph_year, 0, 1) + graph_mdy = month_graph + ( + (self.DELETE_EXTRA_SPACE + day_graph) | graph_year | (self.DELETE_EXTRA_SPACE + day_graph + graph_year) + ) + the_graph = delete("the") + the_graph |= delete("The").optimize() + + graph_dmy = ( + the_graph + + self.DELETE_SPACE + + day_graph + + self.DELETE_SPACE + + delete("of") + + self.DELETE_EXTRA_SPACE + + month_graph + + optional_graph_year + ) + + period_fy = insert('text: "') + financial_period_graph + (cross(" ", "") | cross(" of ", "")) + insert('"') + graph_year = insert('year: "') + (year_graph | _get_range_graph()) + insert('"') + graph_fy = period_fy + insert(" ") + graph_year + tagger = graph_mdy | graph_dmy | graph_year | graph_fy + tagger += insert(" preserve_order: \"true\"") + self.tagger = self.add_tokens(tagger).optimize() + + def build_verbalizer(self): + month = delete('month: "') + closure(self.NOT_QUOTE, 1) + delete('"') + day = delete('day: "') + closure(self.NOT_QUOTE, 1) + delete('"') + year = delete('year: "') + closure(self.NOT_QUOTE, 1) + delete('"') + period = delete('text: "') + closure(self.NOT_QUOTE, 1) + delete('"') + graph_fy = period + closure(self.DELETE_EXTRA_SPACE + year, 0, 1) + # month (day) year + graph_mdy = month + closure(self.DELETE_EXTRA_SPACE + day, 0, 1) + closure(self.DELETE_EXTRA_SPACE + year, 0, 1) + # (day) month year + graph_dmy = closure(day + self.DELETE_EXTRA_SPACE, 0, 1) + month + closure(self.DELETE_EXTRA_SPACE + year, 0, 1) + + verbalizer = graph_mdy | year | graph_dmy | graph_fy + verbalizer = (graph_mdy | year | graph_dmy | graph_fy) + delete(" preserve_order: \"true\"") + self.verbalizer = self.delete_tokens(verbalizer).optimize() diff --git a/itn/english/rules/ordinal.py b/itn/english/rules/ordinal.py index 537a586a..9f8c922e 100644 --- a/itn/english/rules/ordinal.py +++ b/itn/english/rules/ordinal.py @@ -44,6 
+44,11 @@ def build_tagger(self): self.graph = graph @ Cardinal().graph_no_exception self.graph |= ((self.TO_LOWER + self.SIGMA) @ self.graph).optimize() + tagger = insert('value: "') + self.graph + insert('"') + self.tagger = self.add_tokens(tagger).optimize() + + def build_verbalizer(self): + super().build_verbalizer() convert_eleven = cross("11", "11th") convert_twelve = cross("12", "12th") convert_thirteen = cross("13", "13th") @@ -51,7 +56,7 @@ def build_tagger(self): convert_two = cross("2", "2nd") convert_three = cross("3", "3rd") convert_rest = insert("th", weight=0.01) - suffix = self.build_rule( + suffix = self.build_rule( convert_eleven | convert_twelve | convert_thirteen @@ -62,7 +67,4 @@ def build_tagger(self): "", "[EOS]", ) - self.graph = self.graph @ suffix - - tagger = insert('value: "') + self.graph + insert('"') - self.tagger = self.add_tokens(tagger).optimize() + self.verbalizer = (self.verbalizer @ suffix) .optimize() diff --git a/itn/english/rules/word.py b/itn/english/rules/word.py index 1503b209..4cb4bf9f 100644 --- a/itn/english/rules/word.py +++ b/itn/english/rules/word.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from pynini import closure from pynini.lib.pynutil import insert from tn.processor import Processor diff --git a/itn/english/test/data/debug.txt b/itn/english/test/data/debug.txt new file mode 100644 index 00000000..190ac529 --- /dev/null +++ b/itn/english/test/data/debug.txt @@ -0,0 +1,10 @@ +july twenty fifth two thousand twelve => july 25 2012 +nineteen eighties => 1980s +two thousand and twenty => 2020 +two thousand and nine => 2009 +the twenty fifth of july twenty twelve => 25 july 2012 +the twenty fifth of july two thousand twelve => 25 july 2012 +the twenty second of july twenty twelve => 22 july 2012 +the fifteenth of january => 15 january +the seventeenth of may twenty ten => 17 may 2010 +january first => january 1 diff --git a/itn/english/test/normalizer_test.py b/itn/english/test/normalizer_test.py index 01c7d6bf..4eaaa44a 100644 --- a/itn/english/test/normalizer_test.py +++ b/itn/english/test/normalizer_test.py @@ -25,13 +25,16 @@ class TestNormalizer: normalizer = InverseNormalizer(overwrite_cache=True) normalizer_cases = chain( - parse_test_case("data/cardinal.txt"), - parse_test_case("data/ordinal.txt"), + # parse_test_case("data/cardinal.txt"), + # parse_test_case("data/ordinal.txt"), + # parse_test_case("data/date.txt"), + parse_test_case("data/debug.txt"), # parse_test_case("data/word.txt"), ) @pytest.mark.parametrize("spoken, written", normalizer_cases) def test_normalizer(self, spoken, written): - print(spoken, written) + # print(f"spoken: {spoken}, written: {written}") print(self.normalizer.tag(spoken)) + print(f"actual: {self.normalizer.normalize(spoken)}, expected: {written}") assert self.normalizer.normalize(spoken) == written diff --git a/tn/token_parser.py b/tn/token_parser.py index 38ea1844..3c195f1f 100644 --- a/tn/token_parser.py +++ b/tn/token_parser.py @@ -33,6 +33,9 @@ "money": ["currency", "value", "decimal"], "time": ["hour", "minute", "second", "noon"], } +EN_ITN_ORDERS = { + "date": ["month", "day", "year", "text"], +} class 
Token: @@ -68,6 +71,8 @@ def __init__(self, ordertype="tn"): self.orders = ITN_ORDERS elif ordertype == "en_tn": self.orders = EN_TN_ORDERS + elif ordertype == "en_itn": + self.orders = EN_ITN_ORDERS else: raise NotImplementedError()