diff --git a/.gitignore b/.gitignore index 3a1ec79..f74e50b 100644 --- a/.gitignore +++ b/.gitignore @@ -40,5 +40,5 @@ compile_commands.json WeTextProcessing.egg-info/ build/ dist/ -tn/*.far -itn/*.far +tn/*.fst +itn/*.fst diff --git a/itn/english/__init__.py b/itn/english/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/itn/english/data/currency.tsv b/itn/english/data/currency.tsv new file mode 100644 index 0000000..2f1fcc0 --- /dev/null +++ b/itn/english/data/currency.tsv @@ -0,0 +1,168 @@ +؋ afghan afghani +l albanian lek +دج algerian dinar +kz angolan kwanza +aed arab emirates dirham +$ argentine peso +֏ armenian dram +ƒ aruban florin +a$ australian dollar +₼ azerbaijani manat +b$ bahamian dollar +.د.ب bahraini dinar +৳ bangladeshi taka +$ barbadian dollar +br belarusian ruble +bz$ belize dollar +$ bermudian dollar +nu bhutanese ngultrum +₿ bitcoin +bs bolivian boliviano +km bosnia and herzegovina convertible mark +p botswana pula +r$ brazilian real +£ british pound +b$ brunei dollar +лв bulgarian lev +fbu burundian franc +៛ cambodian riel +ca$ canadian dollar +cve cape verde escudo +cve cape verdean escudo +ci$ cayman islands dollar +cfa cfa franc +$ chilean peso +元 chinese yuan +$ colombian peso +cf comorian franc +fc congolese franc +₡ costa rican colón +kn croatian kuna +cuc$ cuban peso +cyp cypriot pound +kč czech koruna +d dalasi +kr danish krone +fdj djiboutian franc +$ dollar +rd$ dominican peso +$ east caribbean dollar +e£ egyptian pound +aed emirati dirham +nkf eritrean nakfa +Ξ ethereum +br ethiopian birr +€ euro +fk£ falkland islands pound +fim finnish markka +₾ georgian lari +dm german mark +gh₵ ghanaian cedi +q guatemalan quetzal +fg guinean franc +g$ guyanese dollar +g haitian gourde +l honduran lempira +hk$ hong kong dollar +ft hungarian forint +₹ inr +íkr icelandic króna +₹ indian rupee +rp indonesian rupiah +﷼ iranian rial +ع.د iraqi dinar +₪ israeli new shekel +₪ israeli shekel +j$ jamaican dollar +¥ japanese yen +ينار 
jordanian dinar +₸ kazakhstani tenge +ksh kenyan shilling +ك kuwaiti dinar +k kyat +som kyrgyzstani som +₭ lao kip +ლარი lari +ل.ل lebanese pound +l lesotho loti +ld$ liberian dollar +ld libyan dinar +l lilangeni +ł litecoin +mop$ macanese pataca +ar malagasy ariary +k malawian kwacha +rm malaysian ringgit +rf maldivian rufiyaa +mro mauritanian ouguiya +₨ mauritian rupee +$ mexican peso +l moldovan leu +ɱ monero +₮ mongolian tögrög +dh moroccan dirham +mt mozambican metical +n$ namibian dollar +rs nepalese rupee +ƒ netherlands antillean guilder +nt$ new taiwan dollar +nz$ new zealand dollar +c$ nicaraguan córdoba +₦ nigerian naira +₩ north korean won +kr norwegian krone +ر.ع omani rial +um ouguiya +pkr pakistani rupee +b/. panamanian balboa +₲ paraguayan guaraní +s/. peruvian sol +₱ philippine peso +zł polish zloty +£ pounds sterling +ر.ق qatari riyal +£ quid +r real +xrp ripples +lei romanian leu +₹ rupee +₽ russian ruble +r₣ rwandan franc +shp saint helena pounds +db são tomé and príncipe dobra +sar saudi riyal +ден second macedonian denar +rsd serbian dinar +scr seychelles rupee +le sierra leonean leone +s$ singapore dollar +sh.so. somali shilling +tjs somoni +r south african rand +₩ south korean won +ss£ south sudanese pound +රු sri lankan rupee +sdg sudanese pound +sr$ surinamese dollar +kr swedish krona +chf swiss franc +£s syrian pound +৳ taka +tzs tanzanian shilling +₮ tether +฿ thai baht +tt$ trinidad and tobago dollar +د.ت tunisian dinar +₺ turkish lira +m turkmen new manat +ush ugandan shilling +₴ ukrainian hryvna +$ united states dollar +$u uruguayan peso +$ us dollar +som uzbekistan som +bs. 
venezuelan bolívar +₫ vietnamese đồng +₩ won +﷼ yemeni rial +¥ yen diff --git a/itn/english/data/date_period.tsv b/itn/english/data/date_period.tsv new file mode 100644 index 0000000..b53ef7d --- /dev/null +++ b/itn/english/data/date_period.tsv @@ -0,0 +1,10 @@ +Q1 first quarter +Q1 q one +Q2 second quarter +Q2 q two +Q3 third quarter +Q3 q three +Q4 fourth quarter +Q4 q four +H1 first half +H2 second half \ No newline at end of file diff --git a/itn/english/data/electronic/domain.tsv b/itn/english/data/electronic/domain.tsv new file mode 100644 index 0000000..207a14a --- /dev/null +++ b/itn/english/data/electronic/domain.tsv @@ -0,0 +1,10 @@ +com +uk +fr +net +br +in +ru +de +it +ai diff --git a/itn/english/data/electronic/server_name.tsv b/itn/english/data/electronic/server_name.tsv new file mode 100644 index 0000000..ca7f97b --- /dev/null +++ b/itn/english/data/electronic/server_name.tsv @@ -0,0 +1,17 @@ +g mail gmail +gmail +n vidia nvidia +nvidia +outlook +hotmail +yahoo +aol +gmx +msn +live +yandex +orange +wanadoo +web +comcast +google diff --git a/itn/english/data/electronic/symbols.tsv b/itn/english/data/electronic/symbols.tsv new file mode 100644 index 0000000..5be8da8 --- /dev/null +++ b/itn/english/data/electronic/symbols.tsv @@ -0,0 +1,22 @@ +. dot +- dash +- hyphen +_ underscore +! exclamation mark +# number sign +$ dollar sign +% percent sign +& ampersand +' quote +* asterisk ++ plus +/ slash += equal sign +? question mark +^ circumflex +` right single quote +{ left brace +| vertical bar +} right brace +~ tilde +, comma \ No newline at end of file diff --git a/itn/english/data/electronic/url_symbols.tsv b/itn/english/data/electronic/url_symbols.tsv new file mode 100644 index 0000000..2946fde --- /dev/null +++ b/itn/english/data/electronic/url_symbols.tsv @@ -0,0 +1,5 @@ +. 
dot +- dash +- hyphen +_ underscore +/ slash \ No newline at end of file diff --git a/itn/english/data/magnitudes.tsv b/itn/english/data/magnitudes.tsv new file mode 100644 index 0000000..3bfa484 --- /dev/null +++ b/itn/english/data/magnitudes.tsv @@ -0,0 +1,4 @@ +k thousand +m million +b billion +t trillion \ No newline at end of file diff --git a/itn/english/data/measurements.tsv b/itn/english/data/measurements.tsv new file mode 100644 index 0000000..894eacd --- /dev/null +++ b/itn/english/data/measurements.tsv @@ -0,0 +1,145 @@ +°F fahrenheit +°C celsius +°F degree fahrenheit +°C degree celsius +K kelvin +km kilometer +m meter +cm centimeter +mm millimeter +ha hectare +mi mile +m² square meter +km² square kilometer +ft foot +% percent +hz hertz +kW kilowatt +kW kilo watt +kWh kilo watt hour +kWh kilowatt hour +Wh watt hour +W watt +hp horsepower +hp horse power +mg milligram +kg kilogram +ghz gigahertz +khz kilohertz +mhz megahertz +v volt +h hour +mc mega coulomb +s second +nm nanometer +rpm revolution per minute +min minute +mA milli ampere +% per cent +m³ cubic meter +mph mile per hour +tW tera watt +mv milli volt +mW megawatt +mW mega watt +μm micrometer +" inch +cc c c +g gram +da dalton +atm atmosphere +ω ohm +db decibel +ps peta second +oz ounce +hl hecto liter +μg microgram +pg petagram +kb kilobit +mb megabit +gb gigabit +tb terabit +pb petabit +ev electron volt +mb megabyte +kb kilobyte +gb gigabyte +tb terabyte +pb peta byte +bps bit per second +kbps kilobit per second +mbps megabit per second +gbps gigabit per second +kbps kilo bit per second +mbps mega bit per second +mbps mega bit per second +tbps terabit per second +tbps tera bit per second +pbps petabit per second +pbps peta bit per second +kb/s kilobyte per second +kb/s kilo byte per second +mb/s megabyte per second +mb/s mega byte per second +gb/s gigabyte per second +gb/s giga byte per second +tb/s terabyte per second +tb/s tera byte per second +pb/s petabyte per second +pb/s peta byte per 
second +st stone +kl kilo liter +tj tera joule +kv kilo volt +mv mega volt +kn kilo newton +kn kilonewton +mm megameter +au astronomical unit +yd yard +rad radian +lm lumen +hs hecto second +mol mole +gpa giga pascal +ml milliliter +gW gigawatt +gW gigaWatt +A ampere +mA mili ampere +µA micro ampere +MA mega ampere +kt knot +kgf kilogram force +ng nano gram +ns nanosecond +ms mega siemens +bar bar +gl giga liter +μs microsecond +dA deci ampere +pa pascal +ds deci second +ms milli second +dm deci meter +dm³ cubic deci meter +amu atomic mass unit +mf mega farad +bq becquerel +pb petabit +mm² square millimeter +cm² square centimeter +sq mi square mile +sq ft square foot +kpa kilopascal +cd candela +tl tera liter +ms mega second +mpa megapascal +pm peta meter +gWh giga watt hour +kcal kilo calory +gy gray +sv sievert +cwt hundredweight +cc c c diff --git a/itn/english/data/months.tsv b/itn/english/data/months.tsv new file mode 100644 index 0000000..1c05ac0 --- /dev/null +++ b/itn/english/data/months.tsv @@ -0,0 +1,12 @@ +january +february +march +april +may +june +july +august +september +october +november +december diff --git a/itn/english/data/numbers/digit.tsv b/itn/english/data/numbers/digit.tsv new file mode 100644 index 0000000..fa32979 --- /dev/null +++ b/itn/english/data/numbers/digit.tsv @@ -0,0 +1,9 @@ +one 1 +two 2 +three 3 +four 4 +five 5 +six 6 +seven 7 +eight 8 +nine 9 \ No newline at end of file diff --git a/itn/english/data/numbers/hundred.tsv b/itn/english/data/numbers/hundred.tsv new file mode 100644 index 0000000..10e2b96 --- /dev/null +++ b/itn/english/data/numbers/hundred.tsv @@ -0,0 +1 @@ +hundred \ No newline at end of file diff --git a/itn/english/data/numbers/teen.tsv b/itn/english/data/numbers/teen.tsv new file mode 100644 index 0000000..8e60fa1 --- /dev/null +++ b/itn/english/data/numbers/teen.tsv @@ -0,0 +1,10 @@ +ten 10 +eleven 11 +twelve 12 +thirteen 13 +fourteen 14 +fifteen 15 +sixteen 16 +seventeen 17 +eighteen 18 +nineteen 19 \ No 
newline at end of file diff --git a/itn/english/data/numbers/thousands.tsv b/itn/english/data/numbers/thousands.tsv new file mode 100644 index 0000000..bf30542 --- /dev/null +++ b/itn/english/data/numbers/thousands.tsv @@ -0,0 +1,22 @@ +thousand +million +billion +trillion +quadrillion +quintillion +sextillion +septillion +octillion +nonillion +decillion +undecillion +duodecillion +tredecillion +quattuordecillion +quindecillion +sexdecillion +septendecillion +octodecillion +novemdecillion +vigintillion +centillion \ No newline at end of file diff --git a/itn/english/data/numbers/ties.tsv b/itn/english/data/numbers/ties.tsv new file mode 100644 index 0000000..d7ec82e --- /dev/null +++ b/itn/english/data/numbers/ties.tsv @@ -0,0 +1,9 @@ +twenty 2 +thirty 3 +forty 4 +fourty 4 +fifty 5 +sixty 6 +seventy 7 +eighty 8 +ninety 9 \ No newline at end of file diff --git a/itn/english/data/numbers/zero.tsv b/itn/english/data/numbers/zero.tsv new file mode 100644 index 0000000..c479272 --- /dev/null +++ b/itn/english/data/numbers/zero.tsv @@ -0,0 +1 @@ +zero 0 \ No newline at end of file diff --git a/itn/english/data/ordinals/digit.tsv b/itn/english/data/ordinals/digit.tsv new file mode 100644 index 0000000..4b7fc24 --- /dev/null +++ b/itn/english/data/ordinals/digit.tsv @@ -0,0 +1,9 @@ +first one +second two +third three +fourth four +fifth five +sixth six +seventh seven +eighth eight +ninth nine \ No newline at end of file diff --git a/itn/english/data/ordinals/teen.tsv b/itn/english/data/ordinals/teen.tsv new file mode 100644 index 0000000..496fefc --- /dev/null +++ b/itn/english/data/ordinals/teen.tsv @@ -0,0 +1 @@ +twelfth twelve \ No newline at end of file diff --git a/itn/english/data/time/minute_to.tsv b/itn/english/data/time/minute_to.tsv new file mode 100644 index 0000000..edab4d5 --- /dev/null +++ b/itn/english/data/time/minute_to.tsv @@ -0,0 +1,59 @@ +1 59 +2 58 +3 57 +4 56 +5 55 +6 54 +7 53 +8 52 +9 51 +10 50 +11 49 +12 48 +13 47 +14 46 +15 45 +16 44 +17 43 +18 
42 +19 41 +20 40 +21 39 +22 38 +23 37 +24 36 +25 35 +26 34 +27 33 +28 32 +29 31 +30 30 +31 29 +32 28 +33 27 +34 26 +35 25 +36 24 +37 23 +38 22 +39 21 +40 20 +41 19 +42 18 +43 17 +44 16 +45 15 +46 14 +47 13 +48 12 +49 11 +50 10 +51 9 +52 8 +53 7 +54 6 +55 5 +56 4 +57 3 +58 2 +59 1 diff --git a/itn/english/data/time/time_suffix.tsv b/itn/english/data/time/time_suffix.tsv new file mode 100644 index 0000000..b39a9ef --- /dev/null +++ b/itn/english/data/time/time_suffix.tsv @@ -0,0 +1,8 @@ +p m p.m. +pm p.m. +p.m. +p.m p.m. +am a.m. +a.m. +a.m a.m. +a m a.m. \ No newline at end of file diff --git a/itn/english/data/time/time_suffix_cased.tsv b/itn/english/data/time/time_suffix_cased.tsv new file mode 100644 index 0000000..97a4cf5 --- /dev/null +++ b/itn/english/data/time/time_suffix_cased.tsv @@ -0,0 +1,8 @@ +P M P.M. +PM P.M. +P.M. +P.M P.M. +AM A.M. +A.M. +A.M A.M. +A M A.M. \ No newline at end of file diff --git a/itn/english/data/time/time_zone.tsv b/itn/english/data/time/time_zone.tsv new file mode 100644 index 0000000..3e0ade4 --- /dev/null +++ b/itn/english/data/time/time_zone.tsv @@ -0,0 +1,7 @@ +cst c s t +cet c e t +pst p s t +est e s t +pt p t +et e t +gmt g m t diff --git a/itn/english/data/time/time_zone_cased.tsv b/itn/english/data/time/time_zone_cased.tsv new file mode 100644 index 0000000..65c70da --- /dev/null +++ b/itn/english/data/time/time_zone_cased.tsv @@ -0,0 +1,7 @@ +CST C S T +CET C E T +PST P S T +EST E S T +PT P T +ET E T +GMT G M T diff --git a/itn/english/data/time/to_hour.tsv b/itn/english/data/time/to_hour.tsv new file mode 100644 index 0000000..fba67f1 --- /dev/null +++ b/itn/english/data/time/to_hour.tsv @@ -0,0 +1,12 @@ +one 12 +two 1 +three 2 +four 3 +five 4 +six 5 +seven 6 +eight 7 +nine 8 +ten 9 +eleven 10 +twelve 11 \ No newline at end of file diff --git a/itn/english/data/whitelist.tsv b/itn/english/data/whitelist.tsv new file mode 100644 index 0000000..c7201ae --- /dev/null +++ b/itn/english/data/whitelist.tsv @@ -0,0 +1,417 @@ 
+10K ten k +1D one d +1G one g +1K one k +2.5G two point five g +2D two d +2G two g +2K two k +3D three d +3G three g +4D four d +4G four g +4K four k +5D five d +5G five g +6G six g +7-eleven seven eleven +7G seven g +8K eight k +AAA triple a +AC a c +ACL a c l +ADAS a das +AEB a e b +AES a e s +AGX a g x +AI a i +AI ai +AI ais +ALU a l u +AMD a m d +API a p i +API api +ARP a r p +ASAP a s a p +ASAP a sap +ASMC a s m c +ASR a s r +ASUS a sus +ASUS asus +AT&T a t and t +ATM a t m +AV1 a v one +AV1 av one +AVAS a v a s +AVAS a vas +AVRCP a v r c p +AWS a w s +AWS aws +AlphaFold alpha fold +AstraZeneca astra zeneca +AstraZeneca astrazeneca +Audio2Face audio to face +Audio2Face audio too face +BBC b b c +BCP b c p +BEV b e v +BGA b g a +BGP b g p +BIOS Bios +BIOS bios +BIS b i s +BSP b s p +BasePOD base pod +BasePOD basepod +Billion-X billion X +C# c sharp +C++ c plus plus +CAT5 cat five +CAT5e cat five e +CAT6 cat six +CAT6A cat six a +CAT7 cat seven +CAT8 cat eight +CBC c b c +CCS c c s +CCTV c c t v +CCTV c c tv +CDN c d n +CEO c e o +CES c e s +CFO c f o +CHAdeMO cha demo +CHAdeMO chai demo +CLI c l i +CMO c m o +CMS c m s +CNN c n n +COVID covid +COVID-19 covid nineteen +CPU c p u +CSO c s o +CSP c s p +CSS c s s +CV c v +CVT c v t +CalTech cal tech +CalTech caltech +CapEx cap ex +CapEx capex +ChatGPT chat g p t +CloudXR cloud x r +ConnectX connect x +Core 2 core two +DALI d a l i +DB d b +DC d c +DCDB d c d b +DDR d d r +DDoS d d o s +DDoS d dos +DGL d g l +DGX d g x -1.1 +DGX A100 d g x a hundred -1.1 +DGX A100 d g x a one hundred -1.1 +DGX Station d g x station -1.1 +DHCP d h c p +DLSS d l s s +DLSS 2 d l s s two +DLSS 3 d l s s three +DNA d n a +DNN d n n +DNS d n s +DOS dos +DP d p +DPDT d p d t +DPU d p u +DPX d p x +DeepMind deep mind +DeepMind deepmind +DeepStream deep stream +DeepStream deepstream +DevOps dev ops +Direct3D direct three d +DisplayPort display port +EBC e b c +EBS e b s +EC2 e c two +ECC e c c +ECS e c s +EFS e f s +EGX e g x +EKS e k s 
+ELB e l b +ELCB e l c b +EMR e m r +EOL e o l +EV e v +Earth-2 earth two +FIFO fee foe +FIFO fifo +FOMO foe moe +FOMO fomo +FPS f p s +FSA f s a +FST f s t +FTP f t p +FTPS f t p s +FX f x +G-SYNC g sync +GCC g c c +GCP g c p +GCS g c s +GFN G f n +GPU g p u +GRPC g r p c +GRU g r u +GSK g s k +GTC g t c +GTX g t x -1.1 +GUI g u i +GUI gui +GeForce geforce +GeForce NOW geforce now +H.264 H two six four +H.264 H two sixty four +H.265 H two six five +H.265 H two sixty five +H100 H one hundred +HDMI h d m i +HEVC h e v c +HGX h g x +HMD Hmd +HPC h p c +HQ h q +HSM h s m +HT h t +HTML h t m l +HTTP h t t p +HTTPS h t t p s +HuggingFace hugging face +I/O i o +ICMP i c m p +IEC i e c +IEEE i triple e +IGMP i g m p +IGX i g x +IISc i i s c +IIT i i t +INT8 int eight +IP i p +IPSec i p sec +ISC i s c +ISI i s i +InfiniBand inifiband +IoT i o t +Isaac SIM isaac sim +JSON j son +JavaScript java script +JavaScript javascript +KVM k v m +LFP lee fei po +LFP life po +LGA l g a -1.1 +LLC l l c +LLM l l m +LLMs l l ms +LLP l l p +LSTM l s t m +LT l t +LTT l t t +LinkedIn linked in +LinkedIn linkedin +MB m b +MCB m c b +MCCB m c c b +MCP m c p +MDM m d m +MFA m f a +MIT m i t +ML m l +MLPerf m l perf +MONAI monai +MWC m w c +MXNet m x net +MacBook mac book +MacBook macbook +MacBook Pro mac book pro +Max-Q max q +Million-X million x +MoCap mo cap +NC n c +NGC n g c +NLP n l p +NLU n l u +NMC n m c +NO n o +NTP n t p +NVDec n v dec +NVDec n v deck +NVEnc n v enc +NVEnc n v inc +NVEnc n v ink +NVMe n v m e +Ni-CD knee cad +NiMH n i m h +NoSQL no s q l +NoSQL no sql +Nvidia A100 nvidia a hundred +Nvidia A100 nvidia a one hundred +Nvidia A30 nvidia a thirty +Nvidia A40 nvidia a forty +OCI o c i +OEM o e m +OEMs o e ms +OLTC o l t c +OLTP o l t p +ONNX o n n x +OVX o v x +OpEx op ex +OpEx opex +OpenCL open c l +OpenGL open g l +PCI p c i +PCIe p c i e -1.1 +PDF p d f +PHEV p h e v +POP3 pop three +PSO p s o +Photoshop photo shop +PhysX fizz ex +PhysX fizz x +PhysX phys ex +PhysX phys x 
+PlayStation play station +Premiere Pro premiere pro +ProVis pro vis +ProVis provis +PyG pi g +PyG py g +PyTorch pi torch +QODA q o d a +QOS q o s +RCBO r c b o +RCCB r c c b +RDS r d s +RFP r f p +RISC risc +RL r l +RNN r n n +ROI r o i +RSS r s s +RT r t +RTP r t p +RTX r t x -1.1 +RTX ON r t x on +RTX Off r t x off +RacerX racer x +Ryzen 2 rye zen two +Ryzen 3 rye zen three +Ryzen 5 rye zen five +Ryzen 7 rye zen seven +Ryzen 9 rye zen nine +S&P s and p +S3 s three +SAE s a e +SDK s d k +SEO s e o +SFU s f u +SHIELD TV shield tv +SIGGRAPH sig graph +SIGGRAPH siggraph +SMPS s m p s +SMTP s m t p +SNMP s n m p +SNS s n s +SOL s o l +SPDT s p d t +SPN s p n +SPST s p s t +SQL s q l +SQS s q s +SSD s s d +SSH s s h +SSL s s l +SSO s s o +SSR s s r +SUV s u v +SWG s w g +SaaS saas +ServiceNow service now +SuperCloud super cloud +SuperPOD super pod +TCP t c p +TCP/IP t c p i p +TFTP t f t p +TJ t j +TOS t o s +TPM t p m +TPMS t p m s +TPN t p n +TSL t s l +TSMC t s m c +TTS t t s +Telnet tell net +TensorRT tensor r t +Tick-Tock tick tock +TikTok app tick tock app +TikTok is tick tock is +UDP u d p +UEFI u e f i +UFI u f i +UPS u p s +USB u s b +VAAPI v a a p i +VAAPI v a api +VDI v d i +VDPAU v d p a u +VDPAU v d paw +VDPAU v d pow +VFD v f d +VLAN v lan +VP v p +VPN v p n +Vcc v c c +Vdd v d d +VoIP v o i p +WFSA w f s a +WFST w f s t +Wi-Fi wi fi +Wi-Fi wifi +XFX x f x +XFX x fx +XML x m l +Zen 2 zen two +Zen 3 zen three +Zen 4 zen four +c# c sharp +c++ c plus plus +cuBLAS cue blah +cuBLAS cue blahs +cuBLAS q blahs +cuDNN c u d n n +cuDNN cue d n n +cuDNN q d n n +cuOpt cue opt +cuOpt q opt +cuQuantum cue quantum +cuQuantum q quantum +dr. doctor +e.g. for example +es3 e s three +ext4 e x t four +i3 i three +i5 i five +i7 i seven +i9 i nine +iOS ios +iTPMS i t p m s +mr. mister +mrs. misses +on TikTok on tick tock +sFTP s f t p +st. 
saint +x1 x one +x2 x two +x8 x eight \ No newline at end of file diff --git a/itn/english/data/whitelist_tech.tsv b/itn/english/data/whitelist_tech.tsv new file mode 100644 index 0000000..a2a14a4 --- /dev/null +++ b/itn/english/data/whitelist_tech.tsv @@ -0,0 +1,450 @@ +10K ten k +1D one d +1G one g +1K one k +2.5G two point five g +2D two d +2G two g +2K two k +3D three d +3G three g +4D four d +4G four g +4K four k +5D five d +5G five g +6G six g +7-eleven seven eleven +7G seven g +8K eight k +AAA triple a +AC a c +ACL a c l +ADAS a das +AEB a e b +AES a e s +AGX a g x +AI a i +AI ai +AI ais +ALU a l u +AMD a m d +API a p i +API api +ARP a r p +ASAP a s a p +ASAP a sap +ASMC a s m c +ASR a s r +ASUS a sus +ASUS asus +AT&T a t and t +ATM a t m +AV1 a v one +AV1 av one +AVAS a v a s +AVAS a vas +AVRCP a v r c p +AWS a w s +AWS aws +AlphaFold alpha fold +AstraZeneca astra zeneca +AstraZeneca astrazeneca +Audio2Face audio to face +Audio2Face audio too face +BBC b b c +BCP b c p +BEV b e v +BGA b g a +BGP b g p +BIOS Bios +BIOS bios +BIS b i s +BSP b s p +BasePOD base pod +BasePOD basepod +Billion-X billion X +C# c sharp +C++ c plus plus +CAT5 cat five +CAT5e cat five e +CAT6 cat six +CAT6A cat six a +CAT7 cat seven +CAT8 cat eight +CBC c b c +CCS c c s +CCTV c c t v +CCTV c c tv +CDN c d n +CEO c e o +CES c e s +CFO c f o +CHAdeMO cha demo +CHAdeMO chai demo +CLI c l i +CMO c m o +CMS c m s +CNN c n n +COVID covid +COVID-19 covid nineteen +CPU c p u +CSO c s o +CSP c s p +CSS c s s +CV c v +CVT c v t +CalTech cal tech +CalTech caltech +CapEx cap ex +CapEx capex +ChatGPT chat g p t +CloudXR cloud x r +ConnectX connect x +Core 2 core two +DALI d a l i +DB d b +DC d c +DCDB d c d b +DDR d d r +DDoS d d o s +DDoS d dos +DGL d g l +DGX d g x -1.1 +DGX A100 d g x a hundred -1.1 +DGX A100 d g x a one hundred -1.1 +DGX Station d g x station -1.1 +DHCP d h c p +DLSS d l s s +DLSS 2 d l s s two +DLSS 3 d l s s three +DNA d n a +DNN d n n +DNS d n s +DOS dos +DP d p +DPDT d p 
d t +DPU d p u +DPX d p x +DeepMind deep mind +DeepMind deepmind +DeepStream deep stream +DeepStream deepstream +DevOps dev ops +Direct3D direct three d +DisplayPort display port +EBC e b c +EBS e b s +EC2 e c two +ECC e c c +ECS e c s +EFS e f s +EGX e g x +EKS e k s +ELB e l b +ELCB e l c b +EMR e m r +EOL e o l +EV e v +Earth-2 earth two +FAT12 fat twelve +FAT16 fat sixteen +FAT32 fat thirty two +FIFO fee foe +FIFO fifo +FOMO foe moe +FOMO fomo +FPS f p s +FSA f s a +FST f s t +FTP f t p +FTPS f t p s +FX f x +G-SYNC g sync +GCC g c c +GCP g c p +GCS g c s +GFN G f n +GPL G P L +GPLv2 G P L two +GPLv3 G P L three +GPU g p u +GRPC g r p c +GRU g r u +GSK g s k +GTC g t c +GTX g t x -1.1 +GUI g u i +GUI gui +GeForce geforce +GeForce NOW geforce now +H.264 H two six four +H.264 H two sixty four +H.265 H two six five +H.265 H two sixty five +H100 H one hundred +HDMI h d m i +HEVC h e v c +HFS h f s +HFS+ h f s plus +HGX h g x +HMD Hmd +HPC h p c +HQ h q +HSM h s m +HT h t +HTML h t m l +HTTP h t t p +HTTPS h t t p s +HuggingFace hugging face +I/O i o +ICMP i c m p +IEC i e c +IEEE i triple e +IGMP i g m p +IGX i g x +IISc i i s c +IIT i i t +INT8 int eight +IP i p +IPSec i p sec +ISC i s c +ISI i s i +InfiniBand inifiband +IoT i o t +Isaac SIM isaac sim +JSON j son +JavaScript java script +JavaScript javascript +KVM k v m +LFP lee fei po +LFP life po +LGA l g a -1.1 +LLC l l c +LLM l l m +LLMs l l ms +LLP l l p +LSTM l s t m +LT l t +LTT l t t +LinkedIn linked in +LinkedIn linkedin +MB m b +MCB m c b +MCCB m c c b +MCP m c p +MDM m d m +MFA m f a +MIT m i t +ML m l +MLPerf m l perf +MONAI monai +MWC m w c +MXNet m x net +MacBook mac book +MacBook macbook +MacBook Pro mac book pro +Max-Q max q +Million-X million x +MoCap mo cap +NC n c +NGC n g c +NLP n l p +NLU n l u +NMC n m c +NO n o +NTFS n t f s +NTP n t p +NVDec n v dec +NVDec n v deck +NVEnc n v enc +NVEnc n v inc +NVEnc n v ink +NVMe n v m e +Ni-CD knee cad +NiMH n i m h +NoSQL no s q l +NoSQL no sql +Nvidia 
A100 Nvidia a hundred +Nvidia A100 Nvidia a one hundred +Nvidia A100 nvidia a hundred +Nvidia A100 nvidia a one hundred +Nvidia A30 Nvidia a thirty +Nvidia A30 nvidia a thirty +Nvidia A40 Nvidia a forty +Nvidia A40 nvidia a forty +OCI o c i +OEM o e m +OEMs o e ms +OLTC o l t c +OLTP o l t p +ONNX o n n x +OVX o v x +OpEx op ex +OpEx opex +OpenCL open c l +OpenGL open g l +PCI p c i +PCIe p c i e -1.1 +PDF p d f +PHEV p h e v +POP3 pop three +PSO p s o +Photoshop photo shop +PhysX fizz ex +PhysX fizz x +PhysX phys ex +PhysX phys x +PlayStation play station +Premiere Pro premiere pro +ProVis pro vis +ProVis provis +PyG pi g +PyG py g +PyTorch pi torch +QODA q o d a +QOS q o s +RCBO r c b o +RCCB r c c b +RDS r d s +RFP r f p +RISC risc +RL r l +RNN r n n +ROI r o i +RSS r s s +RT r t +RTP r t p +RTX r t x -1.1 +RTX ON r t x on +RTX Off r t x off +RacerX racer x +Ryzen 2 rye zen two +Ryzen 3 rye zen three +Ryzen 5 rye zen five +Ryzen 7 rye zen seven +Ryzen 9 rye zen nine +S&P s and p +S3 s three +SAE s a e +SDK s d k +SEO s e o +SFU s f u +SHIELD TV shield tv +SIGGRAPH sig graph +SIGGRAPH siggraph +SMPS s m p s +SMTP s m t p +SNMP s n m p +SNS s n s +SOL s o l +SPDT s p d t +SPN s p n +SPST s p s t +SQL s q l +SQS s q s +SSD s s d +SSH s s h +SSL s s l +SSO s s o +SSR s s r +SUV s u v +SWG s w g +SaaS saas +ServiceNow service now +SuperCloud super cloud +SuperPOD super pod +TCP t c p +TCP/IP t c p i p +TFTP t f t p +TJ t j +TOS t o s +TPM t p m +TPMS t p m s +TPN t p n +TSL t s l +TSMC t s m c +TTS t t s +Telnet tell net +TensorRT tensor r t +The A100 The a hundred +The A100 The a one hundred +The A100 the a hundred +The A100 the a one hundred +The A30 The a thirty +The A30 the a thirty +The A40 The a forty +The A40 the a forty +Tick-Tock tick tock +TikTok app tick tock app +TikTok is tick tock is +UDP u d p +UEFI u e f i +UFI u f i +UFS u f s +UPS u p s +USB u s b +VAAPI v a a p i +VAAPI v a api +VDI v d i +VDPAU v d p a u +VDPAU v d paw +VDPAU v d pow +VFD v f d 
+VLAN v lan +VMFS v m f s +VP v p +VPN v p n +Vcc v c c +Vdd v d d +VoIP v o i p +WFSA w f s a +WFST w f s t +Wi-Fi wi fi +Wi-Fi wifi +XFX x f x +XFX x fx +XML x m l +ZFS z f s +Zen 2 zen two +Zen 3 zen three +Zen 4 zen four +b-tree b t ree +btrfs b t r f s +btrfs b tree f s +btrfs butter f s +c# c sharp +c++ c plus plus +cuBLAS cue blah +cuBLAS cue blahs +cuBLAS q blahs +cuDNN c u d n n +cuDNN cue d n n +cuDNN q d n n +cuOpt cue opt +cuOpt q opt +cuQuantum cue quantum +cuQuantum q quantum +dr. doctor +e.g. for example +es3 e s three +exFAT ex fat +ext3 e x t three +ext4 e x t four +i3 i three +i5 i five +i7 i seven +i9 i nine +iOS ios +iTPMS i t p m s +mr. mister +mrs. misses +on TikTok on tick tock +sFTP s f t p +st. saint +x1 x one +x16 x sixteen +x2 x two +x4 x four +x8 x eight +xfs x f s diff --git a/itn/english/data/year_suffix.tsv b/itn/english/data/year_suffix.tsv new file mode 100644 index 0000000..65a05a1 --- /dev/null +++ b/itn/english/data/year_suffix.tsv @@ -0,0 +1,6 @@ +AD a d +BC b c +BCE b c e +BCE before common era +CE c e +CE common era diff --git a/itn/english/inverse_normalizer.py b/itn/english/inverse_normalizer.py new file mode 100644 index 0000000..cf27ced --- /dev/null +++ b/itn/english/inverse_normalizer.py @@ -0,0 +1,47 @@ +# Copyright (c) 2022 Xingchen Song (sxc19@tsinghua.org.cn) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class InverseNormalizer(Processor):
    """English inverse text normalizer assembled from the per-category rules.

    The tagger is the closure over a weighted union of the Cardinal, Ordinal,
    Date and Word taggers; the verbalizer is the closure over the union of
    their verbalizers.
    """

    def __init__(self, cache_dir=None, overwrite_cache=False):
        """Create the processor and build its FSTs.

        Args:
            cache_dir: directory handed to ``build_fst``; when ``None`` the
                installed ``itn`` package directory is used instead.
            overwrite_cache: forwarded unchanged to ``build_fst``
                (presumably forces a rebuild of cached FSTs -- confirm
                against ``Processor.build_fst``).
        """
        super().__init__(name="en_inverse_normalizer", ordertype="en_itn")
        fst_dir = files("itn") if cache_dir is None else cache_dir
        self.build_fst("en_itn", fst_dir, overwrite_cache)

    def build_tagger(self):
        """Build ``self.tagger`` from the weighted category taggers."""
        # NOTE(review): presumably the lowest-weight path wins, which would
        # make Word (weight 100) the fallback category -- confirm.
        cardinal = add_weight(Cardinal().tagger, 1.0)
        ordinal = add_weight(Ordinal().tagger, 1.09)
        date = add_weight(Date().tagger, 1.09)
        word = add_weight(Word().tagger, 100)

        any_token = (cardinal | ordinal | date | word).optimize()
        token_sequence = any_token.star

        # remove the last space
        strip_final_space = self.build_rule(delete(" "), "", "[EOS]")
        self.tagger = token_sequence @ strip_final_space

    def build_verbalizer(self):
        """Build ``self.verbalizer`` as any sequence of category verbalizers."""
        cardinal = Cardinal().verbalizer
        date = Date().verbalizer
        ordinal = Ordinal().verbalizer
        word = Word().verbalizer

        any_token = (cardinal | date | ordinal | word).optimize()
        self.verbalizer = any_token.star
class Cardinal(Processor):
    """
    Finite state transducer for classifying cardinals,
        e.g. minus twenty three -> value: "-23"
    (the tagged value is then passed through ``add_tokens``).

    The spelled-out numbers zero to twelve (as returned by ``num_to_word``),
    in lower-case or capitalized form, are removed from the input language of
    ``self.graph`` and so are not converted when they stand alone.
    ``self.graph_no_exception`` keeps them.

    Attributes set by ``build_tagger``:
        graph_two_digit: teens, or ties followed by an optional digit.
        graph_hundred_component_at_least_one_none_zero_digit: three-digit
            group restricted to outputs containing a non-zero digit.
        graph_ties_component_at_least_one_none_zero_digit: two-digit group
            restricted to outputs containing a non-zero digit.
        graph_no_exception: full cardinal graph, small numbers included.
        graph: cardinal graph without the zero..twelve exceptions.
        tagger: final tagging transducer.
    """

    # Scale words of the international system, most significant first.
    _INTERNATIONAL_SCALES = (
        "sextillion",
        "quintillion",
        "quadrillion",
        "trillion",
        "billion",
        "million",
        "thousand",
    )
    # Scale words of the Indian system that are accepted in singular and
    # plural ("lakh" / "lakhs"), most significant first.  "thousand" closes
    # the chain and is accepted in the singular only.
    _INDIAN_SCALES = ("shankh", "padma", "nil", "kharab", "arab", "crore", "lakh")

    def __init__(self):
        super().__init__("cardinal")
        self.build_tagger()
        self.build_verbalizer()

    def delete_word(self, word: str):
        """Return an FST deleting ``word`` as given or with its first letter upper-cased.

        Args:
            word: the word to delete; an empty string yields only the
                deletion of the empty string.
        """
        delete_graph = delete(word).optimize()
        if word:
            delete_graph |= delete(word[0].upper() + word[1:])
        return delete_graph.optimize()

    def _scale_group(self, component, scale_deleter, padding):
        """Return ``component`` followed by a deleted scale word, or ``padding`` zeros.

        The zero-padding branch (weight 0.1) stands in for a scale group that
        is absent from the spoken number.
        """
        return union(component + self.DELETE_SPACE + scale_deleter, insert(padding, weight=0.1))

    def _chain(self, groups):
        """Concatenate ``groups`` in order, deleting the spaces between them."""
        chained = groups[0]
        for group in groups[1:]:
            chained = chained + self.DELETE_SPACE + group
        return chained

    def build_tagger(self):
        """Build the cardinal graphs and the tagger (see the class docstring)."""
        graph_zero = string_file(get_abs_path("../itn/english/data/numbers/zero.tsv"))
        graph_digit = string_file(get_abs_path("../itn/english/data/numbers/digit.tsv"))
        graph_ties = string_file(get_abs_path("../itn/english/data/numbers/ties.tsv"))
        graph_teen = string_file(get_abs_path("../itn/english/data/numbers/teen.tsv"))
        graph_hundred = cross("hundred", "")

        self.graph_two_digit = graph_teen | ((graph_ties) + self.DELETE_SPACE + (graph_digit | insert("0")))

        # Two-digit tail: a teen, or optional ties + optional digit; missing
        # parts are padded with zeros so the output is always two digits.
        graph_ties_component = union(
            graph_teen | insert("00"),
            (graph_ties | insert("0")) + self.DELETE_SPACE + (graph_digit | insert("0")),
        )
        # Acceptor for digit strings that contain at least one non-zero digit.
        at_least_one_non_zero_digit = closure(self.DIGIT) + (self.DIGIT - "0") + closure(self.DIGIT)

        # Three-digit group: optional "<digit> hundred" followed by the tail.
        graph_hundred_component = (
            union(graph_digit + self.DELETE_SPACE + graph_hundred, insert("0"))
            + self.DELETE_SPACE
            + graph_ties_component
        )
        graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ at_least_one_non_zero_digit
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            graph_hundred_component_at_least_one_none_zero_digit
        )

        # Transducer for eleven hundred -> 1100 or twenty one hundred eleven -> 2111
        graph_hundred_as_thousand = (
            union(graph_teen, graph_ties + self.DELETE_SPACE + graph_digit)
            + (self.DELETE_SPACE + graph_hundred)
            + (self.DELETE_SPACE + graph_ties_component)
        )

        graph_hundreds = graph_hundred_component | graph_hundred_as_thousand

        graph_ties_component_at_least_one_none_zero_digit = graph_ties_component @ at_least_one_non_zero_digit
        self.graph_ties_component_at_least_one_none_zero_digit = graph_ties_component_at_least_one_none_zero_digit

        # %%% International numeric format: three digits per scale word.
        graph_int = self._chain(
            [
                self._scale_group(
                    graph_hundred_component_at_least_one_none_zero_digit, self.delete_word(scale), "000"
                )
                for scale in self._INTERNATIONAL_SCALES
            ]
        )
        # %%%

        # %% Indian numeric format simple https://en.wikipedia.org/wiki/Indian_numbering_system
        # This only covers "standard format".
        # Conventional format like thousand crores/lakh crores is yet to be implemented
        indian_scale_deleters = [delete(scale) | delete(scale + "s") for scale in self._INDIAN_SCALES]
        indian_scale_deleters.append(delete("thousand"))
        graph_ind = self._chain(
            [
                self._scale_group(graph_ties_component_at_least_one_none_zero_digit, scale_deleter, "00")
                for scale_deleter in indian_scale_deleters
            ]
        )

        graph = union((graph_int | graph_ind) + self.DELETE_SPACE + graph_hundreds, graph_zero)
        # Strip the leading zeros produced by the padding branches; a lone "0" is kept.
        graph = graph @ union(delete(closure("0")) + difference(self.DIGIT, "0") + closure(self.DIGIT), "0")

        # Spelled-out 0..12, lower-case and capitalized, are left unconverted.
        labels_exception = [num_to_word(x) for x in range(13)]
        labels_exception += [x.capitalize() for x in labels_exception]
        graph_exception = union(*labels_exception).optimize()

        # Drop a space-delimited "and" ("one hundred and one") before matching.
        graph = (self.build_rule(delete("and"), self.SPACE, self.SPACE) @ (self.ALPHA + self.SIGMA) @ graph).optimize()
        # Also accept input after TO_LOWER has been applied to its first character.
        graph |= ((self.TO_LOWER + self.SIGMA) @ graph).optimize()

        self.graph_no_exception = graph
        self.graph = (project(graph, "input") - graph_exception.arcsort()) @ graph
        tagger = (
            insert('value: "') + cross(union("minus", "Minus") + self.DELETE_SPACE, "-").ques + self.graph + insert('"')
        )
        self.tagger = self.add_tokens(tagger).optimize()
from pynini import accep, closure, cross, string_file, union
from pynini.lib.pynutil import add_weight, delete, insert

from itn.english.rules.ordinal import Ordinal
from tn.processor import Processor
from tn.utils import get_abs_path


class Date(Processor):
    """
    Finite state transducer for classifying date,
        e.g. january fifth twenty twelve -> date { month: "january" day: "5" year: "2012" preserve_order: "true" }
        e.g. the fifth of january twenty twelve -> date { day: "5" month: "january" year: "2012" preserve_order: "true" }
        e.g. twenty twenty -> date { year: "2020" preserve_order: "true" }

    The field strings above are the ones inserted by ``build_tagger``; the
    enclosing ``date { ... }`` markup is added by ``add_tokens``.
    """

    def __init__(self):
        super().__init__("date")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger`` for month/day/year, day-of-month, bare year and period + year inputs."""
        graph_teen = string_file(get_abs_path("../itn/english/data/numbers/teen.tsv")).optimize()
        graph_digit = string_file(get_abs_path("../itn/english/data/numbers/digit.tsv")).optimize()
        ties_graph = string_file(get_abs_path("../itn/english/data/numbers/ties.tsv")).optimize()
        month_graph = string_file(get_abs_path("../itn/english/data/months.tsv"))
        financial_period_graph = string_file(get_abs_path("../itn/english/data/date_period.tsv")).invert()
        graph_ad_bc = self.DELETE_SPACE + string_file(get_abs_path("../itn/english/data/year_suffix.tsv")).invert()

        def _get_ties_graph():
            """
            Transducer for 20-99 e.g
            twenty three -> 23
            """
            # The units position must read a spoken digit word ("three"), so it
            # uses graph_digit; self.DIGIT only accepts digit characters and
            # could never match the spoken input.
            graph = ties_graph + (self.DELETE_SPACE + graph_digit | insert("0"))
            graph |= ((self.TO_LOWER + self.SIGMA) @ graph).optimize()
            return graph

        def _get_range_graph():
            """
            Transducer for decades (1**0s, 2**0s), centuries (2*00s, 1*00s), millennia (2000s)
            """
            graph_ties = _get_ties_graph()
            graph = (graph_ties | graph_teen) + self.DELETE_SPACE + cross("hundreds", "00s")
            graph |= cross("two", "2") + self.DELETE_SPACE + cross("thousands", "000s")
            # Decades: singularize the plural word ("eighties" -> "eighty",
            # "tens" -> "ten"), convert it, then re-attach the "s".
            graph |= (
                (graph_ties | graph_teen)
                + self.DELETE_SPACE
                + (closure(self.ALPHA, 1) + (cross("ies", "y") | delete("s"))) @ (graph_ties | cross("ten", "10"))
                + insert("s")
            )
            # Keep only four-digit results starting with 1 or 2.
            graph @= union("1", "2") + self.DIGIT + self.DIGIT + self.DIGIT + "s"
            graph |= ((self.TO_LOWER + self.SIGMA) @ graph).optimize()
            return graph

        def _get_year_graph():
            """
            Transducer for year, e.g. twenty twenty -> 2020
            """

            def _get_digits_graph():
                # "oh five" / "o five" -> 05
                zero = cross((accep("oh") | accep("o")), "0")
                graph = zero + self.DELETE_SPACE + graph_digit
                graph |= ((self.TO_LOWER + self.SIGMA) @ graph).optimize()
                return graph

            def _get_thousands_graph():
                # "<digit> thousand [<digit> hundred] [and] <two-digit part>"
                graph_ties = _get_ties_graph()
                graph_hundred_component = (graph_digit + self.DELETE_SPACE + delete("hundred")) | insert("0")
                optional_end = closure(delete("and "), 0, 1)
                graph = (
                    graph_digit
                    + self.DELETE_SPACE
                    + delete("thousand")
                    + self.DELETE_SPACE
                    + graph_hundred_component
                    + self.DELETE_SPACE
                    + (graph_teen | graph_ties | (optional_end + insert("0") + graph_digit))
                )
                graph |= ((self.TO_LOWER + self.SIGMA) @ graph).optimize()
                return graph

            graph_ties = _get_ties_graph()
            graph_digits = _get_digits_graph()
            graph_thousands = _get_thousands_graph()

            year_graph = (
                # 20 19, 40 12, 2012 - assuming no limit on the year
                (graph_teen + self.DELETE_SPACE + (graph_ties | graph_digits | graph_teen))
                | (graph_ties + self.DELETE_SPACE + (graph_ties | graph_digits | graph_teen))
                | graph_thousands
                # The remaining alternatives all require a year suffix from year_suffix.tsv.
                | ((graph_digit + self.DELETE_SPACE + (graph_ties | graph_digits | graph_teen)) + graph_ad_bc)
                | (
                    (graph_digit | graph_teen | graph_digits | graph_ties | graph_thousands)
                    + self.DELETE_SPACE
                    + graph_ad_bc
                )
                | (
                    (graph_ties + self.DELETE_SPACE + (graph_ties | graph_digits | graph_teen))
                    + self.DELETE_SPACE
                    + graph_ad_bc
                )
                | (
                    (
                        (graph_teen | graph_digit)
                        + self.DELETE_SPACE
                        + delete("hundred")
                        + insert("0")
                        + (graph_digit | insert("0"))
                    )
                    + self.DELETE_SPACE
                    + graph_ad_bc
                )
            )
            year_graph.optimize()
            year_graph |= ((self.TO_LOWER + self.SIGMA) @ year_graph).optimize()
            return year_graph

        year_graph = _get_year_graph()
        # A bare year carries this small weight; where the year follows a
        # month/day the weight is cancelled again (see graph_year below).
        YEAR_WEIGHT = 0.001
        year_graph = add_weight(year_graph, YEAR_WEIGHT)
        month_graph = insert('month: "') + month_graph + insert('"')

        # The day is read with the ordinal graph and given a negative weight.
        day_graph = insert('day: "') + add_weight(Ordinal().graph, -0.7) + insert('"')
        graph_year = self.DELETE_EXTRA_SPACE + insert('year: "') + add_weight(year_graph, -YEAR_WEIGHT) + insert('"')
        optional_graph_year = closure(graph_year, 0, 1)
        # month followed by a day, a year, or both
        graph_mdy = month_graph + (
            (self.DELETE_EXTRA_SPACE + day_graph) | graph_year | (self.DELETE_EXTRA_SPACE + day_graph + graph_year)
        )
        the_graph = delete("the")
        the_graph |= delete("The").optimize()

        # "the <day> of <month> [<year>]"
        graph_dmy = (
            the_graph
            + self.DELETE_SPACE
            + day_graph
            + self.DELETE_SPACE
            + delete("of")
            + self.DELETE_EXTRA_SPACE
            + month_graph
            + optional_graph_year
        )

        period_fy = insert('text: "') + financial_period_graph + (cross(" ", "") | cross(" of ", "")) + insert('"')
        # From here on graph_year is the stand-alone year (or decade/century range) field.
        graph_year = insert('year: "') + (year_graph | _get_range_graph()) + insert('"')
        graph_fy = period_fy + insert(" ") + graph_year
        tagger = graph_mdy | graph_dmy | graph_year | graph_fy
        tagger += insert(" preserve_order: \"true\"")
        self.tagger = self.add_tokens(tagger).optimize()

    def build_verbalizer(self):
        """Build ``self.verbalizer``: strip the field markup, keeping the values in tagged order."""
        month = delete('month: "') + closure(self.NOT_QUOTE, 1) + delete('"')
        day = delete('day: "') + closure(self.NOT_QUOTE, 1) + delete('"')
        year = delete('year: "') + closure(self.NOT_QUOTE, 1) + delete('"')
        period = delete('text: "') + closure(self.NOT_QUOTE, 1) + delete('"')
        graph_fy = period + closure(self.DELETE_EXTRA_SPACE + year, 0, 1)
        # month (day) year
        graph_mdy = month + closure(self.DELETE_EXTRA_SPACE + day, 0, 1) + closure(self.DELETE_EXTRA_SPACE + year, 0, 1)
        # (day) month year
        graph_dmy = closure(day + self.DELETE_EXTRA_SPACE, 0, 1) + month + closure(self.DELETE_EXTRA_SPACE + year, 0, 1)

        # Every alternative ends with the preserve_order marker written by the tagger.
        verbalizer = (graph_mdy | year | graph_dmy | graph_fy) + delete(" preserve_order: \"true\"")
        self.verbalizer = self.delete_tokens(verbalizer).optimize()
from pynini import closure, cross, string_file, union
from pynini.lib.pynutil import insert

from itn.english.rules.cardinal import Cardinal
from tn.processor import Processor
from tn.utils import get_abs_path


class Ordinal(Processor):
    """
    Finite state transducer for classifying ordinal
        e.g. thirteenth -> value: "13"
    (the surrounding token markup comes from ``add_tokens``).

    The verbalizer appends the English ordinal suffix to the digits
    (st / nd / rd / th, with 11, 12 and 13 taking "th").
    """

    def __init__(self):
        super().__init__("ordinal")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.graph`` (spoken ordinal -> digits) and ``self.tagger``."""
        digit_ordinals = string_file(get_abs_path("../itn/english/data/ordinals/digit.tsv"))
        teen_ordinals = string_file(get_abs_path("../itn/english/data/ordinals/teen.tsv"))

        # Rewrite only the ordinal ending; everything before it passes through,
        # so the result can be read by the cardinal graph. The two .tsv tables
        # presumably map ordinal words to their cardinal spelling.
        ending = union(digit_ordinals, teen_ordinals, cross("tieth", "ty"), cross("th", ""))
        as_cardinal_words = closure(self.CHAR) + ending

        # graph_no_exception is used so that small ordinals ("first" ... "twelfth")
        # are converted even though Cardinal.graph leaves zero..twelve as words.
        self.graph = as_cardinal_words @ Cardinal().graph_no_exception
        self.graph |= ((self.TO_LOWER + self.SIGMA) @ self.graph).optimize()

        self.tagger = self.add_tokens(insert('value: "') + self.graph + insert('"')).optimize()

    def build_verbalizer(self):
        """Extend the base verbalizer with a rule that adds the ordinal suffix at end of string."""
        super().build_verbalizer()
        suffix_choices = union(
            # 11/12/13 are listed before the single-digit rules for 1/2/3.
            cross("11", "11th"),
            cross("12", "12th"),
            cross("13", "13th"),
            cross("1", "1st"),
            cross("2", "2nd"),
            cross("3", "3rd"),
            # Fallback: plain "th", weighted so the specific rules are preferred.
            insert("th", weight=0.01),
        )
        suffix = self.build_rule(suffix_choices, "", "[EOS]")
        self.verbalizer = (self.verbalizer @ suffix).optimize()
from typing import Union

import inflect

_inflect = inflect.engine()


def num_to_word(x: Union[str, int]):
    """
    Convert an integer to its spoken representation.

    Hyphens in the spelled-out number are replaced by spaces and commas are
    removed, so the result is made of space-separated words.

    Args:
        x: integer, given either as an int or as its string form

    Returns: spoken representation
    """
    spoken = _inflect.number_to_words(str(x))
    return spoken.replace("-", " ").replace(",", "")
from pynini.lib.pynutil import insert

from tn.processor import Processor


class Word(Processor):
    """
    Finite state transducer for classifying plain tokens that do not belong to any special class.
    This can be considered as the default class.
        e.g. sleep -> word { value: "sleep" }
    """

    def __init__(self):
        super().__init__("word")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger``: wrap any run of non-space symbols in a ``value`` field."""
        quoted_token = insert('value: "') + self.NOT_SPACE.star + insert('"')
        self.tagger = self.add_tokens(quoted_token.optimize())
diff --git a/itn/english/test/data/cardinal.txt b/itn/english/test/data/cardinal.txt new file mode 100644 index 0000000..270837b --- /dev/null +++ b/itn/english/test/data/cardinal.txt @@ -0,0 +1,60 @@ +nine trillion seven hundred eighty nine billion three hundred eighty two million five hundred thirty six thousand one hundred thirty => 9789382536130 +two hundred and fifty four => 254 +one hundred forty seven thousand four hundred fifty one => 147451 +one million one hundred fifty six thousand one hundred seventy three => 1156173 +one billion five hundred ninety three million seventy two thousand nine hundred sixty one => 1593072961 +One Billion five hundred ninety three Million seventy two thousand nine hundred sixty one => 1593072961 +ninety seven quadrillion eight hundred eight trillion two hundred sixty four billion seven hundred seventy two million seven hundred ninety two thousand five => 97808264772792005 +seventeen sextillion eight hundred fifty five quintillion thirty six quadrillion six hundred fifty seven trillion seven billion five hundred ninety six million one hundred ten thousand nine hundred forty nine => 17855036657007596110949 +ten quadrillion ten trillion ten million one hundred thousand ten => 10010000010100010 +minus twenty five thousand thirty seven => -25037 +one quadrillion two hundred sixty four trillion three hundred one billion nine hundred thirty eight million one hundred four => 1264301938000104 +minus sixty => -60 +forty six thousand six hundred sixty four => 46664 +sixty => 60 +zero => zero +two million three => 2000003 +one thousand thirteen => 1013 +one thousand one => 1001 +one thousand one hundred => 1100 +one thousand twenty six => 1026 +one thousand one hundred twenty six => 1126 +eighteen million four hundred fifty thousand nine hundred ninety => 18450990 +eighteen million nine hundred forty thousand seven hundred twenty two => 18940722 +eighteen million six hundred ninety thousand nine hundred sixteen => 18690916 +eighteen 
thousand eight hundred eighty => 18880 +eleven hundred => 1100 +twenty one hundred => 2100 +twenty one hundred and eleven => 2111 +eleven hundred twenty one => 1121 +Nine trillion seven hundred eighty nine billion three hundred eighty two million five hundred thirty six thousand one hundred thirty => 9789382536130 +Two hundred and fifty four => 254 +One hundred forty seven thousand four hundred fifty one => 147451 +One million one hundred fifty six thousand one hundred seventy three => 1156173 +One billion five hundred ninety three million seventy two thousand nine hundred sixty one => 1593072961 +Ninety seven quadrillion eight hundred eight trillion two hundred sixty four billion seven hundred seventy two million seven hundred ninety two thousand five => 97808264772792005 +Seventeen sextillion eight hundred fifty five quintillion thirty six quadrillion six hundred fifty seven trillion seven billion five hundred ninety six million one hundred ten thousand nine hundred forty nine => 17855036657007596110949 +Ten quadrillion ten trillion ten million one hundred thousand ten => 10010000010100010 +Minus twenty five thousand thirty seven => -25037 +One quadrillion two hundred sixty four trillion three hundred one billion nine hundred thirty eight million one hundred four => 1264301938000104 +Minus sixty => -60 +Forty six thousand six hundred sixty four => 46664 +Sixty => 60 +Zero => Zero +Two million three => 2000003 +One thousand thirteen => 1013 +One thousand one => 1001 +One thousand one hundred => 1100 +One thousand twenty six => 1026 +One thousand one hundred twenty six => 1126 +Eighteen million four hundred fifty thousand nine hundred ninety => 18450990 +Eighteen million nine hundred forty thousand seven hundred twenty two => 18940722 +Eighteen million six hundred ninety thousand nine hundred sixteen => 18690916 +Eighteen thousand eight hundred eighty => 18880 +Eleven hundred => 1100 +Twenty one hundred => 2100 +Twenty one hundred and eleven => 2111 +Eleven hundred 
twenty one => 1121 +Nineteen => 19 +Twelve => Twelve +twenty one crore ninety eight lakh thirty six thousand five hundred and ninety three => 219836593 diff --git a/itn/english/test/data/date.txt b/itn/english/test/data/date.txt new file mode 100644 index 0000000..5845363 --- /dev/null +++ b/itn/english/test/data/date.txt @@ -0,0 +1,70 @@ +july twenty fifth two thousand twelve => july 25 2012 +nineteen eighties => 1980s +two thousand and twenty => 2020 +two thousand and nine => 2009 +the twenty fifth of july twenty twelve => 25 july 2012 +the twenty fifth of july two thousand twelve => 25 july 2012 +the twenty second of july twenty twelve => 22 july 2012 +the fifteenth of january => 15 january +the seventeenth of may twenty ten => 17 may 2010 +january first => january 1 +july twenty second two thousand eight => july 22 2008 +june thirty => june 30 +july twenty fifth twenty twelve => july 25 2012 +nineteen seventeen => 1917 +twenty twelve => 2012 +march sixteen sixty five => march 1665 +sixteen sixty five => 1665 +july two thousand twelve => july 2012 +october nineteen oh five => october 1905 +july fifteen o six => july 1506 +the twenty fifth of july twenty twelve => 25 july 2012 +july twenty fifth twenty twelve => july 25 2012 +july twenty fifth two thousand twelve => july 25 2012 +july one thousand eight hundred seventy six => july 1876 +february twenty fifth twenty sixteen => february 25 2016 +november twenty fourth twenty fourteen => november 24 2014 +nineteen ninety four => 1994 +two thousand three => 2003 +one thousand eight => 1008 +nineteen seventy six => 1976 +june twentieth twenty fourteen => june 20 2014 +nineteen seventy three => 1973 +nineteen seventy five => 1975 +eleven fifty five => 1155 +July twenty fifth two thousand twelve => July 25 2012 +Nineteen eighties => 1980s +Two thousand and twenty => 2020 +Two thousand and nine => 2009 +The twenty fifth of july twenty twelve => 25 july 2012 +The twenty fifth of july two thousand twelve => 25 july 2012 
+The twenty second of july twenty twelve => 22 july 2012 +The fifteenth of january => 15 january +The fifteenth of January => 15 January +The seventeenth of may twenty ten => 17 may 2010 +January first => January 1 +July twenty second two thousand eight => July 22 2008 +June thirty => June 30 +July twenty fifth twenty twelve => July 25 2012 +Nineteen seventeen => 1917 +Twenty twelve => 2012 +March sixteen sixty five => March 1665 +Sixteen sixty five => 1665 +July two thousand twelve => July 2012 +October nineteen oh five => October 1905 +July fifteen o six => July 1506 +The twenty fifth of july twenty twelve => 25 july 2012 +The twenty fifth of July twenty twelve => 25 July 2012 +July twenty fifth twenty twelve => July 25 2012 +July twenty fifth two thousand twelve => July 25 2012 +July one thousand eight hundred seventy six => July 1876 +February twenty fifth twenty sixteen => February 25 2016 +November twenty fourth twenty fourteen => November 24 2014 +Nineteen ninety four => 1994 +Two thousand three => 2003 +One thousand eight => 1008 +Nineteen seventy six => 1976 +June twentieth twenty fourteen => June 20 2014 +Nineteen seventy three => 1973 +Nineteen seventy five => 1975 +Eleven fifty five => 1155 diff --git a/itn/english/test/data/debug.txt b/itn/english/test/data/debug.txt new file mode 100644 index 0000000..190ac52 --- /dev/null +++ b/itn/english/test/data/debug.txt @@ -0,0 +1,10 @@ +july twenty fifth two thousand twelve => july 25 2012 +nineteen eighties => 1980s +two thousand and twenty => 2020 +two thousand and nine => 2009 +the twenty fifth of july twenty twelve => 25 july 2012 +the twenty fifth of july two thousand twelve => 25 july 2012 +the twenty second of july twenty twelve => 22 july 2012 +the fifteenth of january => 15 january +the seventeenth of may twenty ten => 17 may 2010 +january first => january 1 diff --git a/itn/english/test/data/decimal.txt b/itn/english/test/data/decimal.txt new file mode 100644 index 0000000..46570c7 --- /dev/null +++ 
b/itn/english/test/data/decimal.txt @@ -0,0 +1,129 @@ +five point two million => 5.2 million +one hundred sixty four point five eight thousand => 164.58 thousand +four hundred million => 400 million +fifty billion => 50 billion +four hundred five billion => 405 billion +four point eight five billion => 4.85 billion +one hundred billion => 100 billion +one hundred ten billion => 110 billion +one hundred thirty two billion => 132 billion +one point eight four billion => 1.84 billion +one point eight one billion => 1.81 billion +one point five nine billion => 1.59 billion +one point four five three billion => 1.453 billion +one point seven two billion => 1.72 billion +one point two five billion => 1.25 billion +thirteen billion => 13 billion +thirty billion => 30 billion +two thousand eight hundred five point eight seven three billion => 2805.873 billion +seventy trillion => 70 trillion +thirteen million => 13 million +eighteen billion => 18 billion +four hundred fifty million => 450 million +one hundred thirty million => 130 million +ten million => 10 million +four hundred million => 400 million +five million => 5 million +five hundred million => 500 million +twelve million => 12 million +thirteen million => 13 million +four million => 4 million +forty five million => 45 million +fifteen million => 15 million +fifteen trillion => 15 trillion +fifteen billion => 15 billion +two million => 2 million +eight million => 8 million +point one two o five => .1205 +minus sixty point two four zero zero => -60.2400 +zero point two six => 0.26 +point zero two => .02 +sixty point two => 60.2 +eighteen => 18 +eighteen point eight five => 18.85 +eighteen point five o => 18.50 +eighteen point five six => 18.56 +eighteen point nine => 18.9 +eighteen point o five => 18.05 +eighteen point one two => 18.12 +eighteen point o one => 18.01 +eighteen point o o o => 18.000 +eighteen point six => 18.6 +eighteen point three o o => 18.300 +eighteen point three six => 18.36 +eighteen point two 
five => 18.25 +eighteen point two two => 18.22 +eight hundred eighteen point three o three => 818.303 +eight hundred eight point eight => 808.8 +eight hundred eight point zero => 808.0 +eight hundred eighty eight point one => 888.1 +eight hundred eighty four point three => 884.3 +eight hundred eighty two point eight => 882.8 +eight hundred eighty two point zero => 882.0 +eight hundred forty five point nine four => 845.94 +Five point two million => 5.2 million +One hundred sixty four point five eight thousand => 164.58 thousand +Four hundred million => 400 million +Fifty billion => 50 billion +Four hundred five billion => 405 billion +Four point eight five billion => 4.85 billion +One hundred billion => 100 billion +One hundred ten billion => 110 billion +One hundred thirty two billion => 132 billion +One point eight four billion => 1.84 billion +One point eight four Billion => 1.84 Billion +One point eight one billion => 1.81 billion +One point five nine billion => 1.59 billion +One point four five three billion => 1.453 billion +One point seven two billion => 1.72 billion +One point two five billion => 1.25 billion +Thirteen billion => 13 billion +Thirty billion => 30 billion +Thirty Billion => 30 Billion +Two thousand eight hundred five point eight seven three billion => 2805.873 billion +Seventy trillion => 70 trillion +Thirteen million => 13 million +Eighteen billion => 18 billion +Four hundred fifty million => 450 million +One hundred thirty million => 130 million +Ten million => 10 million +Four hundred million => 400 million +Five million => 5 million +Five hundred million => 500 million +Twelve million => 12 million +Thirteen million => 13 million +Four million => 4 million +Forty five million => 45 million +Fifteen million => 15 million +Fifteen trillion => 15 trillion +Fifteen billion => 15 billion +Fifteen Billion => 15 Billion +Two million => 2 million +Eight million => 8 million +Point one two o five => .1205 +Minus sixty point two four zero zero => 
-60.2400 +Zero point two six => 0.26 +Point zero two => .02 +Sixty point two => 60.2 +Eighteen => 18 +Eighteen point eight five => 18.85 +Eighteen point five o => 18.50 +Eighteen point five six => 18.56 +Eighteen point nine => 18.9 +Eighteen point o five => 18.05 +Eighteen point one two => 18.12 +Eighteen point o one => 18.01 +Eighteen point o o o => 18.000 +Eighteen point six => 18.6 +Eighteen point three o o => 18.300 +Eighteen point three six => 18.36 +Eighteen point two five => 18.25 +Eighteen point two two => 18.22 +Eight hundred eighteen point three o three => 818.303 +Eight hundred eight point eight => 808.8 +Eight hundred eight point zero => 808.0 +Eight hundred eighty eight point one => 888.1 +Eight hundred eighty four point three => 884.3 +Eight hundred eighty two point eight => 882.8 +Eight hundred eighty two point zero => 882.0 +Eight hundred forty five point nine four => 845.94 diff --git a/itn/english/test/data/electronic.txt b/itn/english/test/data/electronic.txt new file mode 100644 index 0000000..556e8be --- /dev/null +++ b/itn/english/test/data/electronic.txt @@ -0,0 +1,59 @@ +a dot b c at g mail dot com => a.bc@gmail.com +a at gmail dot com => a@gmail.com +a at m s n dot fr => a@msn.fr +a at a o l dot com => a@aol.com +a at m s n dot com => a@msn.com +a at nvidia dot com => a@nvidia.com +a dot b c at nvidia dot com => a.bc@nvidia.com +c d f at a b c dot e d u => cdf@abc.edu +a b c at g mail dot a b c => abc@gmail.abc +a b c at a b c dot com => abc@abc.com +a s d f one two three at a b c dot com => asdf123@abc.com +a one b two at a b c dot com => a1b2@abc.com +a b three dot s d d dot three at g mail dot com => ab3.sdd.3@gmail.com +dot three at g mail dot com => dot 3@gmail.com +one three at g mail dot com => 13@gmail.com +a b three hyphen s d d dash three at g mail dot com => ab3-sdd-3@gmail.com +h t t p colon slash slash w w w dot o u r d a i l y n e w s dot com dot s m => http://www.ourdailynews.com.sm +h t t p colon slash slash w w w dot c o m 
d a i l y n e w s dot a b dot s m => http://www.comdailynews.ab.sm +h t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b slash s m => http://www.comdailynews.ab/sm +w w w dot c o m d a i l y n e w s dot a b slash s m => www.comdailynews.ab/sm +c o m d a i l y n e w s dot a b slash s m => comdailynews.ab/sm +n vidia dot com => nvidia.com +abc at gmail dot com => abc@gmail.com +athreed at gmail dot com => athreed@gmail.com +kore dot ai => kore.ai +A dot b c at g mail dot com => A.bc@gmail.com +A at gmail dot com => A@gmail.com +A at m s n dot fr => A@msn.fr +A at a o l dot com => A@aol.com +A at m s n dot com => A@msn.com +A at nvidia dot com => A@nvidia.com +A dot b c at nvidia dot com => A.bc@nvidia.com +C d f at a b c dot e d u => Cdf@abc.edu +A b c at g mail dot a b c => Abc@gmail.abc +A b c at a b c dot com => Abc@abc.com +A s d f one two three at a b c dot com => Asdf123@abc.com +A one b two at a b c dot com => A1b2@abc.com +A b three dot s d d dot three at g mail dot com => Ab3.sdd.3@gmail.com +Dot three at g mail dot com => Dot 3@gmail.com +One three at g mail dot com => 13@gmail.com +A b three hyphen s d d dash three at g mail dot com => Ab3-sdd-3@gmail.com +H t t p colon slash slash w w w dot o u r d a i l y n e w s dot com dot s m => http://www.ourdailynews.com.sm +HTTP colon slash slash w w w dot o u r d a i l y n e w s dot com dot s m => http://www.ourdailynews.com.sm +HTTP colon slash slash w w w dot o u r d a i l y n e w s dot com dot Zero one => http://www.ourdailynews.com.01 +http colon slash slash w w w dot o u r d a i l y n e w s dot com dot SM and s => http://www.ourdailynews.com.SM and s +Http colon slash slash w w w dot o u r d a i l y n e w s dot COM dot s m => http://www.ourdailynews.COM.sm +H t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b dot s m => http://www.comdailynews.ab.sm +H t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b slash s m => http://www.comdailynews.ab/sm +W w w dot c o m d a 
i l y n e w s dot a b slash s m => www.comdailynews.ab/sm +C o m d a i l y n e w s dot a b slash s m => Comdailynews.ab/sm +N vidia dot com => nvidia.com +Abc at gmail dot com => Abc@gmail.com +Athreed at gmail dot com => Athreed@gmail.com +Kore dot ai => Kore.ai +NVIDIA dot com => NVIDIA.com +NVIDIA dot COM => NVIDIA.COM +WWW.A B C at A B C dot com => WWW.A BC@ABC.com +W W W. A B C dot com => www.ABC.com +w w w . o u r d a i l y n e w s dot com . s m => www.ourdailynews.com . s m diff --git a/itn/english/test/data/measure.txt b/itn/english/test/data/measure.txt new file mode 100644 index 0000000..64fa978 --- /dev/null +++ b/itn/english/test/data/measure.txt @@ -0,0 +1,232 @@ +two hundred meters => 200 m +fifty six point three per square kilometer => 56.3 /km² +two hundred kilometers per hour => 200 km/h +twenty eight kilograms force per square centimeter => 28 kgf/cm² +forty two thousand two hundred fifty nine per square meter => 42259 /m² +minus two thousand twelve kilo liters => -2012 kl +minus sixty six kilograms => -66 kg +two kilo watt hours => 2 kWh +one point o o o o two eight cubic deci meters => 1.000028 dm³ +seven point five peta bytes => 7.5 pb +three hours => 3 h +one milli volt => 1 mv +two cubic meters => 2 m³ +ninety grams => 90 g +one hundred twenty four point three lumens => 124.3 lm +four hundred forty milliliters => 440 ml +thirty one thousand four hundred eighty square feet => 31480 sq ft +one thousand six hundred hours => 1600 h +thirty one thousand four hundred eighty square feet => 31480 sq ft +two square miles => 2 sq mi +zero point one nine square miles => 0.19 sq mi +one thousand five hundred thirty one c c => 1531 cc +three hundred micrometers => 300 μm +sixty five thousand square kilometers => 65000 km² +two miles per hour => 2 mph +two hundred forty five miles per hour => 245 mph +one hundred fifty c c => 150 cc +sixty point two four zero zero kilograms => 60.2400 kg +zero feet => 0 ft +zero foot => 0 ft +two feet => 2 ft +twenty foot 
=> 20 ft +point two meters => .2 m +two square meters => 2 m² +eighteen feet => 18 ft +eighteen mega siemens => 18 ms +eighteen ounces => 18 oz +eighteen point five kilometers => 18.5 km +eighteen point five two square kilometers => 18.52 km² +eighteen point nine one square kilometers => 18.91 km² +eighteen point one four percent => 18.14 % +eighteen point one six percent => 18.16 % +eighteen point one square kilometers => 18.1 km² +eighteen point six percent => 18.6 % +eighteen point two two kilometers => 18.22 km +eighteen point zero kilometers => 18.0 km +eighteen point zero percent => 18.0 % +eighteen square kilometers => 18 km² +eighteen thousand eight hundred giga watt hours => 18800 gWh +eighteen thousand seven hundred hectares => 18700 ha +eight hectares => 8 ha +eight hundred eighty five astronomical units => 885 au +eight hundred eighty hectares => 880 ha +eight hundred eighty kilobytes => 880 kb +eight hundred eighty kilometers => 880 km +eight hundred eighty nine feet => 889 ft +eight hundred eighty six kilometers => 886 km +eight hundred eighty two megawatts => 882 mW +eight hundred feet => 800 ft +eight hundred fifty five square kilometers => 855 km² +eight hundred fifty megahertz => 850 mhz +eight hundred fifty meters => 850 m +eight hundred fifty nanometers => 850 nm +eight hundred fifty one meters => 851 m +eight hundred fifty seven square kilometers => 857 km² +eight hundred fifty three meters => 853 m +eight hundred fifty three point six meters => 853.6 m +eight hundred five point four six square kilometers => 805.46 km² +eight hundred forty two point nine meters => 842.9 m +eight hundred forty two square kilometers => 842 km² +eight hundred gigabytes => 800 gb +eight hundred horsepower => 800 hp +eight hundred kilograms => 800 kg +eight hundred kilo watt hours => 800 kWh +eight hundred kilowatts => 800 kW +eight hundred megahertz => 800 mhz +eight hundred ninety four c c => 894 cc +eight hundred ninety kilowatts => 890 kW +eight hundred ninety 
millimeters => 890 mm +eight hundred ninety two square kilometers => 892 km² +eight hundred seventy horsepower => 870 hp +eight hundred seventy meters => 870 m +eight hundred sixty kilograms => 860 kg +eight hundred sixty kilometers => 860 km +eight hundred sixty miles => 860 mi +eight hundred sixty six feet => 866 ft +eight hundred ten hectares => 810 ha +eight hundred ten kilohertz => 810 khz +eight hundred thirty eight point two millimeters => 838.2 mm +eight hundred thirty five kilometers => 835 km +eight hundred thirty kilohertz => 830 khz +eight hundred thirty megawatts => 830 mW +eight hundred thirty nine kilometers => 839 km +eight hundred thirty six meters => 836 m +eight hundred twenty feet => 820 ft +eight hundred twenty kilometers => 820 km +eight hundred twenty meters => 820 m +eight hundred twenty one point zero feet => 821.0 ft +eight hundred two point eight nine kilometers => 802.89 km +eight hundred volts => 800 v +eight kilobits => 8 kb +eight kilograms => 8 kg +eight million two hundred thousand feet => 8200000 ft +eight point eight kilometers => 8.8 km +eight point eight meters => 8.8 m +eight point eight miles => 8.8 mi +eight point five centimeters => 8.5 cm +eight point five five percent => 8.55 % +eight point five megawatts => 8.5 mW +eight point five meters => 8.5 m +eight point five two percent => 8.52 % +eight point four four percent => 8.44 % +Two hundred meters => 200 m +Fifty six point three per square kilometer => 56.3 /km² +Two hundred kilometers per hour => 200 km/h +Twenty eight kilograms force per square centimeter => 28 kgf/cm² +Forty two thousand two hundred fifty nine per square meter => 42259 /m² +Minus two thousand twelve kilo liters => -2012 kl +Minus sixty six kilograms => -66 kg +Two kilo watt hours => 2 kWh +One point o o o o two eight cubic deci meters => 1.000028 dm³ +Seven point five peta bytes => 7.5 pb +Three hours => 3 h +One milli volt => 1 mv +Two cubic meters => 2 m³ +Ninety grams => 90 g +One hundred twenty four 
point three lumens => 124.3 lm +Four hundred forty milliliters => 440 ml +Thirty one thousand four hundred eighty square feet => 31480 sq ft +One thousand six hundred hours => 1600 h +Thirty one thousand four hundred eighty square feet => 31480 sq ft +Two square miles => 2 sq mi +Zero point one nine square miles => 0.19 sq mi +One thousand five hundred thirty one c c => 1531 cc +Three hundred micrometers => 300 μm +Sixty five thousand square kilometers => 65000 km² +Two miles per hour => 2 mph +Two hundred forty five miles per hour => 245 mph +One hundred fifty c c => 150 cc +Sixty point two four zero zero kilograms => 60.2400 kg +Zero feet => 0 ft +Zero foot => 0 ft +Two feet => 2 ft +Twenty foot => 20 ft +Point two meters => .2 m +Two square meters => 2 m² +Eighteen feet => 18 ft +Eighteen mega siemens => 18 ms +Eighteen ounces => 18 oz +Eighteen point five kilometers => 18.5 km +Eighteen point five two square kilometers => 18.52 km² +Eighteen point nine one square kilometers => 18.91 km² +Eighteen point one four percent => 18.14 % +Eighteen point one six percent => 18.16 % +Eighteen point one square kilometers => 18.1 km² +Eighteen point six percent => 18.6 % +Eighteen point two two kilometers => 18.22 km +Eighteen point zero kilometers => 18.0 km +Eighteen point zero percent => 18.0 % +Eighteen square kilometers => 18 km² +Eighteen thousand eight hundred giga watt hours => 18800 gWh +Eighteen thousand seven hundred hectares => 18700 ha +Eight hectares => 8 ha +Eight hundred eighty five astronomical units => 885 au +Eight hundred eighty hectares => 880 ha +Eight hundred eighty kilobytes => 880 kb +Eight hundred eighty kilometers => 880 km +Eight hundred eighty nine feet => 889 ft +Eight hundred eighty six kilometers => 886 km +Eight hundred eighty two megawatts => 882 mW +Eight hundred feet => 800 ft +Eight hundred fifty five square kilometers => 855 km² +Eight hundred fifty megahertz => 850 mhz +Eight hundred fifty meters => 850 m +Eight hundred fifty 
nanometers => 850 nm +Eight hundred fifty one meters => 851 m +Eight hundred fifty seven square kilometers => 857 km² +Eight hundred fifty three meters => 853 m +Eight hundred fifty three point six meters => 853.6 m +Eight hundred five point four six square kilometers => 805.46 km² +Eight hundred forty two point nine meters => 842.9 m +Eight hundred forty two square kilometers => 842 km² +Eight hundred gigabytes => 800 gb +Eight hundred horsepower => 800 hp +Eight hundred kilograms => 800 kg +Eight hundred kilo watt hours => 800 kWh +Eight hundred kilowatts => 800 kW +Eight hundred megahertz => 800 mhz +Eight hundred ninety four c c => 894 cc +Eight hundred ninety kilowatts => 890 kW +Eight hundred ninety millimeters => 890 mm +Eight hundred ninety two square kilometers => 892 km² +Eight hundred seventy horsepower => 870 hp +Eight hundred seventy meters => 870 m +Eight hundred sixty kilograms => 860 kg +Eight hundred sixty kilometers => 860 km +Eight hundred sixty miles => 860 mi +Eight hundred sixty six feet => 866 ft +Eight hundred ten hectares => 810 ha +Eight hundred ten kilohertz => 810 khz +Eight hundred thirty eight point two millimeters => 838.2 mm +Eight hundred thirty five kilometers => 835 km +Eight hundred thirty kilohertz => 830 khz +Eight hundred thirty megawatts => 830 mW +Eight hundred thirty nine kilometers => 839 km +Eight hundred thirty six meters => 836 m +Eight hundred twenty feet => 820 ft +Eight hundred twenty kilometers => 820 km +Eight hundred twenty meters => 820 m +Eight hundred twenty one point zero feet => 821.0 ft +Eight hundred two point eight nine kilometers => 802.89 km +Eight hundred volts => 800 v +Eight kilobits => 8 kb +Eight KiloBits => 8 kb +Eight kilograms => 8 kg +Eight million two hundred thousand feet => 8200000 ft +Eight point eight kilometers => 8.8 km +Eight point eight meters => 8.8 m +Eight point eight miles => 8.8 mi +Eight point five centimeters => 8.5 cm +Eight point five five percent => 8.55 % +Eight point five 
megawatts => 8.5 mW +Eight point five meters => 8.5 m +Eight point five two percent => 8.52 % +Eight point four four percent => 8.44 % +one gigabit per second => 1 gbps +nine gigabits per second => 9 gbps +five degrees celsius => 5 °C +seventy two degrees fahrenheit => 72 °F +Seventy two Degrees Fahrenheit => 72 °F +two hundred seventy three kelvin => 273 K +Nine GigaBits per second => 9 gbps diff --git a/itn/english/test/data/money.txt b/itn/english/test/data/money.txt new file mode 100644 index 0000000..9729c14 --- /dev/null +++ b/itn/english/test/data/money.txt @@ -0,0 +1,54 @@ +Two dollars => $2 +One cent => $0.01 +Four united states dollars and sixty nine cents => $4.69 +Seventy five dollars sixty three => $75.63 +Twenty nine dollars fifty cents => $29.50 +Eleven dollars and fifty one cents => $11.51 +Nine hundred ninety three dollars and ninety two cents => $993.92 +Four hundred sixty billion won => ₩460 billion +Thirty billion yen => ¥30 billion +Two point five billion dollars => $2.5 billion +Forty five billion dollars => $45 billion +Fifty million dollars => $50 million +Fifty billion dollars => $50 billion +Zero point two million dollars => $0.2 million +Fifteen point two billion dollars => $15.2 billion +One point six nine billion yuan => 1.69 billion yuan +One point four three six billion yuan => 1.436 billion yuan +Four million yuan => 4 million yuan +One dollar => $1 +Fifteen thousand dollars => $15000 +One dollars => One dollars +Twenty dollar => $20 +Twenty point five o six dollars => $20.506 +Point five o six dollars => $.506 +Eighteen dollars => $18 +Eighteen million nine hundred twenty five thousand dollars => $18925000 +Eighteen thousand eight hundred fifty four dollars => $18854 +Eighteen thousand eight hundred one dollars => $18801 +Eighteen thousand eight hundred seventy five dollars => $18875 +Eighteen thousand eighty one dollars => $18081 +Eighteen thousand fifty two dollars => $18052 +Eighteen thousand five hundred forty two dollars => 
$18542 +Eighteen thousand five hundred nineteen dollars => $18519 +Eighteen thousand five hundred seventy dollars => $18570 +Eighteen thousand five hundred seventy eight dollars => $18578 +Eighteen thousand five hundred sixteen dollars => $18516 +Eighteen thousand four hundred eighty two dollars => $18482 +Eighteen thousand four hundred seventy eight dollars => $18478 +Eighteen thousand four hundred sixty eight dollars => $18468 +Eighteen thousand nine hundred three dollars => $18903 +Eighteen thousand nine hundred twenty nine dollars => $18929 +Eighteen thousand ninety five dollars => $18095 +Eighteen thousand one hundred seventeen dollars => $18117 +Eighteen thousand one hundred twenty eight dollars => $18128 +Eighteen thousand one hundred twenty five dollars => $18125 +Eighteen thousand one hundred twenty four dollars => $18124 +Eighteen thousand one hundred twenty nine dollars => $18129 +One thousand fifty five dollars => $1055 +One fifty five dollars => $155 +Fifteen hundred dollars => $1500 +Ninety nine hundred dollars => $9900 +Ninety nine hundred and fifteen dollars and one cent => $9915.01 +zero point two Million Dollars => $0.2 Million +One Cent => $0.01 diff --git a/itn/english/test/data/ordinal.txt b/itn/english/test/data/ordinal.txt new file mode 100644 index 0000000..04a867f --- /dev/null +++ b/itn/english/test/data/ordinal.txt @@ -0,0 +1,34 @@ +One hundredth => 100th +Twenty five thousand one hundred eleventh => 25111th +Second => 2nd +Zeroth => 0th +First => 1st +Second => 2nd +Third => 3rd +Fourth => 4th +Eleventh => 11th +Twelfth => 12th +Thirteenth => 13th +Twenty first => 21st +Twenty third => 23rd +One hundred eleventh => 111th +One thousandth => 1000th +One hundred twenty first => 121st +Eleven hundred twenty first => 1121st +Second => 2nd +Tenth => 10th +Sixth => 6th +Third => 3rd +Nineteenth => 19th +Third => 3rd +Twelfth => 12th +Forty eighth => 48th +Seventy first => 71st +Third => 3rd +Forty second => 42nd +Seventeenth => 17th +Twentieth 
=> 20th +Twenty first => 21st +Seventh => 7th +Second => 2nd +Fifth => 5th diff --git a/itn/english/test/data/telephone.txt b/itn/english/test/data/telephone.txt new file mode 100644 index 0000000..c2b03d0 --- /dev/null +++ b/itn/english/test/data/telephone.txt @@ -0,0 +1,18 @@ +One two three one two three five six seven eight => 123-123-5678 +Plus nine one one two three one two three five six seven eight => +91 123-123-5678 +Plus forty four one two three one two three five six seven eight => +44 123-123-5678 +Four one two three one two three five six seven eight => 4 123-123-5678 +Zero two three one two three five six seven eight => 023-123-5678 +O two three one two three five six seven eight => 023-123-5678 +Oh two three one two three five six seven eight => 023-123-5678 +Double oh three one two three five six seven eight => 003-123-5678 +Four three two double seven three two one four three two one four three double zero five => 432 7732 143214 3005 +One two three dot one two three dot o dot four o => 123.123.0.40 +One twenty three dot one two three dot o dot four o => 123.123.0.40 +Two two five dot double five dot o dot four o => 225.55.0.40 +Two two five dot double five dot o dot forty five => 225.55.0.45 +SSN is seven double nine one two three double one three => SSN is 799-12-3113 +Seven nine nine => 799 +A b nine => Ab9 +A b c => A b c +Five w k r a three one => 5wkra31 diff --git a/itn/english/test/data/time.txt b/itn/english/test/data/time.txt new file mode 100644 index 0000000..86603ac --- /dev/null +++ b/itn/english/test/data/time.txt @@ -0,0 +1,33 @@ +Eight oclock g m t => 08:00 gmt +Seven a m e s t => 07:00 a.m. est +Two p m => 02:00 p.m. +Seven A M E S T => 07:00 A.M. EST +Two P M => 02:00 P.M. 
+Two thirty => 02:30 +Three o'clock => 03:00 +Quarter past one => 01:15 +Half past three => 03:30 +Eight fifty one => 08:51 +Eight fifty two => 08:52 +Eight forty => 08:40 +Eight nineteen => 08:19 +Eight o six => 08:06 +Eight thirty eight => 08:38 +Eight thirty two => 08:32 +Eight twenty nine => 08:29 +Eleven fifty five p m => 11:55 p.m. +Eleven fifty three p m => 11:53 p.m. +Eleven forty a m => 11:40 a.m. +Eleven forty five a m => 11:45 a.m. +Eleven forty p m => 11:40 p.m. +Eleven forty six a m => 11:46 a.m. +Eleven o six p m => 11:06 p.m. +Eleven thirteen a m => 11:13 a.m. +Half past twelve => 12:30 +Quarter past one => 01:15 +Quarter to one => 12:45 +Quarter to twelve => 11:45 +Set alarm at ten to eleven pm => Set alarm at 10:50 p.m. +One min to one am => 12:59 a.m. +eleven Forty six A M => 11:46 A.M. +eleven forty six AM => 11:46 A.M. diff --git a/itn/english/test/data/whitelist.txt b/itn/english/test/data/whitelist.txt new file mode 100644 index 0000000..7415241 --- /dev/null +++ b/itn/english/test/data/whitelist.txt @@ -0,0 +1,9 @@ +Doctor dao => Dr. dao +Misses Smith => Mrs. Smith +Misses smith => Mrs. smith +Mister Dao => Mr. Dao +Saint George => St. George +I like for example ice cream => I like e.g. ice cream +S and p five hundred => S&P 500 +S and P Five hundred => S&P 500 +Seven eleven stores => 7-eleven stores diff --git a/itn/english/test/data/word.txt b/itn/english/test/data/word.txt new file mode 100644 index 0000000..3a8d965 --- /dev/null +++ b/itn/english/test/data/word.txt @@ -0,0 +1,49 @@ +E s three => Es3 +Yahoo! => Yahoo! +Twenty! => 20 ! 
+X => X +— => — +Aaa => Aaa +Aabach => Aabach +Aabenraa => Aabenraa +Aabye => Aabye +Aaccessed => Aaccessed +Aach => Aach +Aachen's => Aachen's +Aadri => Aadri +Aafia => Aafia +Aagaard => Aagaard +Aagadu => Aagadu +Aagard => Aagard +Aagathadi => Aagathadi +Aaghart's => Aaghart's +Aagnes => Aagnes +Aagomoni => Aagomoni +Aagon => Aagon +Aagoo => Aagoo +Aagot => Aagot +Aahar => Aahar +Aahh => Aahh +Aahperd => Aahperd +Aaibinterstate => Aaibinterstate +Aajab => Aajab +Aakasa => Aakasa +Aakervik => Aakervik +Aakirkeby => Aakirkeby +Aalam => Aalam +Aalbaek => Aalbaek +Aaldiu => Aaldiu +Aalem => Aalem +A'ali => A'ali +Aalilaassamthey => Aalilaassamthey +Aalin => Aalin +Aaliyan => Aaliyan +Aaliyan's => Aaliyan's +Aamadu => Aamadu +Aamara => Aamara +Aambala => Aambala +Aamera => Aamera +Aamer's => Aamer's +Aamina => Aamina +Aaminah => Aaminah +Aamjiwnaang => Aamjiwnaang diff --git a/itn/english/test/normalizer_test.py b/itn/english/test/normalizer_test.py new file mode 100644 index 0000000..4eaaa44 --- /dev/null +++ b/itn/english/test/normalizer_test.py @@ -0,0 +1,40 @@ +# Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from itertools import chain + +import pytest + +from itn.english.inverse_normalizer import InverseNormalizer +from itn.english.test.utils import parse_test_case + + +class TestNormalizer: + + normalizer = InverseNormalizer(overwrite_cache=True) + + normalizer_cases = chain( + # parse_test_case("data/cardinal.txt"), + # parse_test_case("data/ordinal.txt"), + # parse_test_case("data/date.txt"), + parse_test_case("data/debug.txt"), + # parse_test_case("data/word.txt"), + ) + + @pytest.mark.parametrize("spoken, written", normalizer_cases) + def test_normalizer(self, spoken, written): + # print(f"spoken: {spoken}, written: {written}") + print(self.normalizer.tag(spoken)) + print(f"actual: {self.normalizer.normalize(spoken)}, expected: {written}") + assert self.normalizer.normalize(spoken) == written diff --git a/itn/english/test/utils.py b/itn/english/test/utils.py new file mode 120000 index 0000000..a2ade7b --- /dev/null +++ b/itn/english/test/utils.py @@ -0,0 +1 @@ +../../../tn/chinese/test/utils.py \ No newline at end of file diff --git a/tn/token_parser.py b/tn/token_parser.py index 38ea184..3c195f1 100644 --- a/tn/token_parser.py +++ b/tn/token_parser.py @@ -33,6 +33,9 @@ "money": ["currency", "value", "decimal"], "time": ["hour", "minute", "second", "noon"], } +EN_ITN_ORDERS = { + "date": ["month", "day", "year", "text"], +} class Token: @@ -68,6 +71,8 @@ def __init__(self, ordertype="tn"): self.orders = ITN_ORDERS elif ordertype == "en_tn": self.orders = EN_TN_ORDERS + elif ordertype == "en_itn": + self.orders = EN_ITN_ORDERS else: raise NotImplementedError()