use core::cmp;
use core::iter::Filter;
use crate::tables::word::WordCat;
/// Iterator over the word-bound segments of a string that pass a filter
/// predicate.
///
/// `new_unicode_words` builds it by filtering `split_word_bounds()` with
/// `has_alphanumeric`, so only segments containing at least one alphanumeric
/// character are yielded.
pub struct UnicodeWords<'a> {
// Word-bound iterator wrapped in a `Filter`; the predicate is stored as a
// plain function pointer so the field type can be named.
inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
}
impl<'a> Iterator for UnicodeWords<'a> {
    type Item = &'a str;

    /// Returns the next segment accepted by the wrapped filter, or `None`
    /// once the underlying iterator is exhausted.
    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        self.inner.next()
    }

    /// Forwards the wrapped filter's size hint.
    ///
    /// Without this override the default `(0, None)` would be reported even
    /// though the inner iterator can supply an upper bound; forwarding keeps
    /// this adapter consistent with `UWordBoundIndices`, which also forwards
    /// its inner hint.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.inner.size_hint()
    }
}
impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
    /// Returns the last remaining segment accepted by the wrapped filter.
    #[inline]
    fn next_back(&mut self) -> Option<&'a str> {
        let filtered = &mut self.inner;
        DoubleEndedIterator::next_back(filtered)
    }
}
/// Iterator over `(offset, segment)` pairs for the word-bound segments of a
/// string that pass a filter predicate.
///
/// `new_unicode_word_indices` builds it by filtering
/// `split_word_bound_indices()` so that only segments for which
/// `has_alphanumeric` returns true are yielded.
pub struct UnicodeWordIndices<'a> {
// Indexed word-bound iterator wrapped in a `Filter`; the predicate is
// stored as a plain function pointer so the field type can be named.
inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
}
impl<'a> Iterator for UnicodeWordIndices<'a> {
    type Item = (usize, &'a str);

    /// Returns the next `(offset, segment)` pair accepted by the wrapped
    /// filter, or `None` once the underlying iterator is exhausted.
    #[inline]
    fn next(&mut self) -> Option<(usize, &'a str)> {
        self.inner.next()
    }

    /// Forwards the wrapped filter's size hint.
    ///
    /// Without this override the default `(0, None)` would be reported even
    /// though the inner iterator can supply an upper bound; forwarding keeps
    /// this adapter consistent with `UWordBoundIndices`, which also forwards
    /// its inner hint.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.inner.size_hint()
    }
}
impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
    /// Returns the last remaining `(offset, segment)` pair accepted by the
    /// wrapped filter.
    #[inline]
    fn next_back(&mut self) -> Option<(usize, &'a str)> {
        let filtered = &mut self.inner;
        DoubleEndedIterator::next_back(filtered)
    }
}
/// Double-ended iterator that splits a string into consecutive segments at
/// word boundaries (presumably the UAX #29 word-break rules, judging by the
/// `WordCat` categories used; confirm against the tables module).
#[derive(Clone)]
pub struct UWordBounds<'a> {
// The part of the input that has not been yielded yet; `next` trims it
// from the front and `next_back` from the back.
string: &'a str,
// Category cached by `next` for the first remaining char, so the following
// call does not have to look it up again.
cat: Option<WordCat>,
// Category cached by `next_back` for the last remaining char.
catb: Option<WordCat>,
}
/// Like `UWordBounds`, but yields each segment together with its byte offset
/// from the start of the string the iterator was created from.
#[derive(Clone)]
pub struct UWordBoundIndices<'a> {
// Address of the first byte of the original string, stored as an integer
// (see `new_word_bound_indices`); segment offsets are computed by pointer
// subtraction against it.
start_offset: usize,
// The underlying segment iterator.
iter: UWordBounds<'a>,
}
impl<'a> UWordBoundIndices<'a> {
#[inline]
pub fn as_str(&self) -> &'a str {
self.iter.as_str()
}
}
impl<'a> Iterator for UWordBoundIndices<'a> {
    type Item = (usize, &'a str);

    /// Yields the next segment from the front, paired with its byte offset
    /// relative to the start of the original string.
    #[inline]
    fn next(&mut self) -> Option<(usize, &'a str)> {
        match self.iter.next() {
            Some(segment) => {
                // The segment is a sub-slice of the original string, so the
                // distance between the two start addresses is its offset.
                let offset = segment.as_ptr() as usize - self.start_offset;
                Some((offset, segment))
            }
            None => None,
        }
    }

    /// Same bounds as the underlying segment iterator.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (lower, upper) = self.iter.size_hint();
        (lower, upper)
    }
}
impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
    /// Yields the next segment from the back, paired with its byte offset
    /// relative to the start of the original string.
    #[inline]
    fn next_back(&mut self) -> Option<(usize, &'a str)> {
        match self.iter.next_back() {
            Some(segment) => {
                // The segment is a sub-slice of the original string, so the
                // distance between the two start addresses is its offset.
                let offset = segment.as_ptr() as usize - self.start_offset;
                Some((offset, segment))
            }
            None => None,
        }
    }
}
/// State of the scanner inside a single `next` / `next_back` call of
/// `UWordBounds`. Most variants record the category of the last significant
/// character that was consumed.
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
enum UWordBoundsState {
// Nothing has been consumed yet in this call.
Start,
// Last significant char had category `WC_ALetter`.
Letter,
// Last significant char had category `WC_Hebrew_Letter`.
HLetter,
// Last significant char had category `WC_Numeric`.
Numeric,
// Last significant char had category `WC_Katakana`.
Katakana,
// Last significant char had category `WC_ExtendNumLet`.
ExtendNumLet,
// Inside a run of `WC_Regional_Indicator` chars; the payload tracks pairing.
Regional(RegionalState),
// Pending state whose payload says what is allowed to come next.
FormatExtend(FormatExtendType),
// Forward: the segment started with a `WC_ZWJ` char.
// Backward: a char satisfying `is_emoji` was consumed, so a `WC_ZWJ`
// before it may extend the segment.
Zwj,
// Forward: a char satisfying `is_emoji` was absorbed right after a ZWJ.
Emoji,
// Inside a run of `WC_WSegSpace` chars.
WSegSpace,
}
/// Payload of `UWordBoundsState::FormatExtend`: what the scanner is prepared
/// to accept as the next significant character.
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
enum FormatExtendType {
// Only set by `next_back`: the next char is classified as if the scanner
// were in `Start`.
AcceptAny,
// Forward: the segment ends at the next char that is not skipped.
// Backward: placeholder while Extend/Format/ZWJ chars are being skipped;
// the saved state is restored when a different char arrives.
AcceptNone,
// Set after a MidLetter / MidNumLet / Single_Quote next to a letter: the
// segment only continues if an ALetter or Hebrew_Letter follows, otherwise
// the scanner rolls back to the saved index.
RequireLetter,
// Set after a Double_Quote next to a Hebrew letter: the segment only
// continues if a Hebrew_Letter follows, otherwise the scanner rolls back.
RequireHLetter,
// Forward: set after a Single_Quote following a Hebrew letter; a letter may
// follow, and the quote stays in the segment either way.
// Backward: set when the scan starts on a Single_Quote; only a preceding
// Hebrew_Letter extends the segment.
AcceptQLetter,
// Set after a MidNum / MidNumLet / Single_Quote next to a numeric char: the
// segment only continues if a Numeric follows, otherwise the scanner rolls
// back to the saved index.
RequireNumeric,
}
/// Pairing state for runs of `WC_Regional_Indicator` characters.
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
enum RegionalState {
// One regional indicator has been consumed; one more may join it.
Half,
// A pair is complete; the next char always starts a new segment.
Full,
// Only set by `next_back`: the pairing has not been determined yet and is
// resolved by counting the regional indicators that precede the position.
Unknown,
}
/// Returns true when the emoji table classifies `ch` as
/// `EC_Extended_Pictographic`.
fn is_emoji(ch: char) -> bool {
    use crate::tables::emoji;

    // Only the category component of the table lookup matters here.
    let category = emoji::emoji_category(ch).2;
    category == emoji::EmojiCat::EC_Extended_Pictographic
}
// Forward iteration: each call to `next` scans from the front of
// `self.string`, decides where the first segment ends, returns that prefix and
// keeps the rest for later calls.
impl<'a> Iterator for UWordBounds<'a> {
type Item = &'a str;
/// Lower bound is 1 for a non-empty remainder (0 when empty); the upper
/// bound is the remaining length in bytes.
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let slen = self.string.len();
(cmp::min(slen, 1), Some(slen))
}
/// Splits off and returns the first segment of the remaining string, or
/// `None` when nothing is left.
#[inline]
fn next(&mut self) -> Option<&'a str> {
use self::UWordBoundsState::*;
use self::FormatExtendType::*;
use crate::tables::word as wd;
if self.string.len() == 0 {
return None;
}
// take_curr: whether the char at `idx` belongs to the returned segment.
// take_cat: when the current char is NOT taken, whether its category is
// cached in `self.cat` for the next call.
let mut take_curr = true;
let mut take_cat = true;
// idx: byte offset of the char currently being examined.
// saveidx / savecat: position and category remembered when a Require*
// state is entered, so the scan can roll back to that char.
let mut idx = 0;
let mut saveidx = 0;
let mut state = Start;
let mut cat = wd::WC_Any;
let mut savecat = wd::WC_Any;
// Set once any Extend/Format/ZWJ char has been skipped in this call.
let mut skipped_format_extend = false;
for (curr, ch) in self.string.char_indices() {
idx = curr;
// `cat` still holds the previous iteration's category at this point.
let prev_zwj = cat == wd::WC_ZWJ;
// Use the category cached by an earlier step if there is one, otherwise
// look it up in the table.
cat = match self.cat {
None => wd::word_category(ch).2,
_ => self.cat.take().unwrap()
};
take_cat = true;
// After the first char, Extend/Format/ZWJ chars are absorbed into the
// segment without changing the state.
if state != Start {
match cat {
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
skipped_format_extend = true;
continue
}
_ => {}
}
}
// A char satisfying `is_emoji` directly after a ZWJ stays in the segment.
if prev_zwj {
if is_emoji(ch) {
state = Emoji;
continue;
}
}
// Note: `break` / `continue` inside this match leave `state` unchanged.
state = match state {
// CR at the start: the segment is the CR plus a directly following LF.
// `idx` is moved onto the LF here and advanced past it after the loop.
Start if cat == wd::WC_CR => {
idx += match self.get_next_cat(idx) {
Some(ncat) if ncat == wd::WC_LF => 1,
_ => 0
};
break;
},
Start => match cat {
wd::WC_ALetter => Letter,
wd::WC_Hebrew_Letter => HLetter,
wd::WC_Numeric => Numeric,
wd::WC_Katakana => Katakana,
wd::WC_ExtendNumLet => ExtendNumLet,
wd::WC_Regional_Indicator => Regional(RegionalState::Half),
// LF / Newline at the start form a segment of their own.
wd::WC_LF | wd::WC_Newline => break,
wd::WC_ZWJ => Zwj,
wd::WC_WSegSpace => WSegSpace,
// Any other category: a single-char segment, unless Format/Extend/ZWJ
// chars follow, in which case they are absorbed first. The looked-up
// category is cached so the next iteration reuses it.
_ => {
if let Some(ncat) = self.get_next_cat(idx) {
if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ {
state = FormatExtend(AcceptNone);
self.cat = Some(ncat);
continue;
}
}
break;
}
},
// A run of WSegSpace only continues if nothing was skipped in between.
WSegSpace => match cat {
wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
_ => {
take_curr = false;
break;
}
},
Zwj => {
take_curr = false;
break;
}
Letter | HLetter => match cat {
wd::WC_ALetter => Letter,
wd::WC_Hebrew_Letter => HLetter,
wd::WC_Numeric => Numeric,
wd::WC_ExtendNumLet => ExtendNumLet,
// Double quote after a Hebrew letter: kept only if another Hebrew
// letter follows (rolled back otherwise).
wd::WC_Double_Quote if state == HLetter => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireHLetter)
},
// Single quote after a Hebrew letter: kept whether or not a letter
// follows (no rollback position is saved).
wd::WC_Single_Quote if state == HLetter => {
FormatExtend(AcceptQLetter)
},
// Mid-word punctuation: kept only if a letter follows.
wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireLetter)
},
_ => {
take_curr = false;
break;
}
},
Numeric => match cat {
wd::WC_Numeric => Numeric,
wd::WC_ALetter => Letter,
wd::WC_Hebrew_Letter => HLetter,
wd::WC_ExtendNumLet => ExtendNumLet,
// Mid-number punctuation: kept only if a numeric char follows.
wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireNumeric)
},
_ => {
take_curr = false;
break;
}
},
Katakana => match cat {
wd::WC_Katakana => Katakana,
wd::WC_ExtendNumLet => ExtendNumLet,
_ => {
take_curr = false;
break;
}
},
ExtendNumLet => match cat {
wd::WC_ExtendNumLet => ExtendNumLet,
wd::WC_ALetter => Letter,
wd::WC_Hebrew_Letter => HLetter,
wd::WC_Numeric => Numeric,
wd::WC_Katakana => Katakana,
_ => {
take_curr = false;
break;
}
},
// A completed regional-indicator pair never extends further.
Regional(RegionalState::Full) => {
take_curr = false;
break;
}
Regional(RegionalState::Half) => match cat {
wd::WC_Regional_Indicator => Regional(RegionalState::Full),
_ => {
take_curr = false;
break;
}
},
Regional(_) => unreachable!("RegionalState::Unknown should not occur on forward iteration"),
Emoji => {
take_curr = false;
break;
},
FormatExtend(t) => match t {
RequireNumeric if cat == wd::WC_Numeric => Numeric,
RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter,
RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter,
RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter,
// End the segment before the current char and do not cache its category.
AcceptNone | AcceptQLetter => {
take_curr = false;
take_cat = false;
break;
},
// Unmet Require* (or AcceptAny): `state` stays FormatExtend, so the
// rollback below decides where the segment ends.
_ => break
}
}
}
// The scan stopped (or the input ended) while a required follow-up char
// was still missing: roll back so the segment ends before the saved
// punctuation char, whose category is then cached for the next call.
if let FormatExtend(t) = state {
if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
idx = saveidx;
cat = savecat;
take_curr = false;
}
}
// Either include the current char (advancing `idx` past it) and clear the
// cache, or leave it for the next call, optionally caching its category.
self.cat = if take_curr {
idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
None
} else if take_cat {
Some(cat)
} else {
None
};
// Split at `idx`: the prefix is returned, the rest is kept.
let retstr = &self.string[..idx];
self.string = &self.string[idx..];
Some(retstr)
}
}
// Backward iteration: each call to `next_back` scans from the end of
// `self.string`, decides where the last segment starts, returns that suffix
// and keeps the rest for later calls.
impl<'a> DoubleEndedIterator for UWordBounds<'a> {
/// Splits off and returns the last segment of the remaining string, or
/// `None` when nothing is left.
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
use self::UWordBoundsState::*;
use self::FormatExtendType::*;
use crate::tables::word as wd;
if self.string.len() == 0 {
return None;
}
// take_curr: whether the char at `idx` belongs to the returned segment.
// take_cat: when the current char is NOT taken, whether its category is
// cached in `self.catb` for the next call.
let mut take_curr = true;
let mut take_cat = true;
// idx starts at the byte offset of the last char.
let mut idx = self.string.len();
idx -= self.string.chars().next_back().unwrap().len_utf8();
// previdx: offset of the char examined in the previous iteration (the one
// to the right of the current char).
// saveidx: offset the segment start is rolled back to when a pending
// FormatExtend state is not resolved.
let mut previdx = idx;
let mut saveidx = idx;
let mut state = Start;
// State saved while Extend/Format/ZWJ chars are being skipped.
let mut savestate = Start;
let mut cat = wd::WC_Any;
let mut skipped_format_extend = false;
for (curr, ch) in self.string.char_indices().rev() {
previdx = idx;
idx = curr;
// Use the category cached by an earlier call if there is one, otherwise
// look it up in the table.
cat = match self.catb {
None => wd::word_category(ch).2,
_ => self.catb.take().unwrap()
};
take_cat = true;
// Extend/Format chars (and ZWJ, except when the state is `Zwj`) are
// skipped: the state in effect is saved and replaced by
// FormatExtend(AcceptNone) until a different char arrives. At `Start`
// the char falls through to the classification below instead.
if cat == wd::WC_Extend
|| cat == wd::WC_Format
|| (cat == wd::WC_ZWJ && state != Zwj) {
if match state {
FormatExtend(_) | Start => false,
_ => true
} {
saveidx = previdx;
savestate = state;
state = FormatExtend(AcceptNone);
}
if state != Start {
continue;
}
} else if state == FormatExtend(AcceptNone) {
// The run of skipped chars ended: restore the saved state and position
// and evaluate the current char against that state.
state = savestate;
previdx = saveidx;
take_cat = false;
skipped_format_extend = true;
}
// Note: `break` / `continue` inside this match leave `state` unchanged.
state = match state {
Start | FormatExtend(AcceptAny) => match cat {
// Checked before the category arms: a char satisfying `is_emoji` may
// be preceded by a ZWJ that extends the segment.
_ if is_emoji(ch) => Zwj,
wd::WC_ALetter => Letter,
wd::WC_Hebrew_Letter => HLetter,
wd::WC_Numeric => Numeric,
wd::WC_Katakana => Katakana,
wd::WC_ExtendNumLet => ExtendNumLet,
wd::WC_Regional_Indicator => Regional(RegionalState::Unknown),
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
// A single quote seen first: only a preceding Hebrew letter extends the
// segment; otherwise the scan rolls back to the quote itself.
wd::WC_Single_Quote => {
saveidx = idx;
FormatExtend(AcceptQLetter)
},
wd::WC_WSegSpace => WSegSpace,
// At `Start`, an LF directly preceded by a CR is taken together with it
// (the step back is one byte). In `AcceptAny` the newline char is left
// for the next call.
wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
if state == Start {
if cat == wd::WC_LF {
idx -= match self.get_prev_cat(idx) {
Some(pcat) if pcat == wd::WC_CR => 1,
_ => 0
};
}
} else {
take_curr = false;
}
break;
},
_ => break
},
// After an emoji char, only a ZWJ extends the segment further back.
Zwj => match cat {
wd::WC_ZWJ => {
FormatExtend(AcceptAny)
}
_ => {
take_curr = false;
break;
}
},
// A run of WSegSpace only continues if nothing was skipped in between.
WSegSpace => match cat {
wd::WC_WSegSpace if !skipped_format_extend => {
WSegSpace
}
_ => {
take_curr = false;
break;
}
},
Letter | HLetter => match cat {
wd::WC_ALetter => Letter,
wd::WC_Hebrew_Letter => HLetter,
wd::WC_Numeric => Numeric,
wd::WC_ExtendNumLet => ExtendNumLet,
// Punctuation before a letter is kept only if the required kind of
// letter precedes it; `saveidx` marks where the segment starts otherwise.
wd::WC_Double_Quote if state == HLetter => {
saveidx = previdx;
FormatExtend(RequireHLetter)
},
wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
saveidx = previdx;
FormatExtend(RequireLetter)
},
_ => {
take_curr = false;
break;
}
},
Numeric => match cat {
wd::WC_Numeric => Numeric,
wd::WC_ALetter => Letter,
wd::WC_Hebrew_Letter => HLetter,
wd::WC_ExtendNumLet => ExtendNumLet,
// Punctuation before a number is kept only if a numeric char precedes it.
wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
saveidx = previdx;
FormatExtend(RequireNumeric)
},
_ => {
take_curr = false;
break;
}
},
Katakana => match cat {
wd::WC_Katakana => Katakana,
wd::WC_ExtendNumLet => ExtendNumLet,
_ => {
take_curr = false;
break;
}
},
ExtendNumLet => match cat {
wd::WC_ExtendNumLet => ExtendNumLet,
wd::WC_ALetter => Letter,
wd::WC_Hebrew_Letter => HLetter,
wd::WC_Numeric => Numeric,
wd::WC_Katakana => Katakana,
_ => {
take_curr = false;
break;
}
},
Regional(mut regional_state) => match cat {
wd::WC_Regional_Indicator => {
// Pairing not determined yet: count the run of regional indicators at
// the end of `self.string[..previdx]` (ignoring ZWJ/Extend/Format).
// An even count resolves to `Full`, an odd count to `Half`.
if regional_state == RegionalState::Unknown {
let count = self.string[..previdx]
.chars().rev()
.map(|c| wd::word_category(c).2)
.filter(|&c| ! (c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format))
.take_while(|&c| c == wd::WC_Regional_Indicator)
.count();
regional_state = if count % 2 == 0 {
RegionalState::Full
} else {
RegionalState::Half
};
}
// `Full`: the current indicator is left for the next call.
// `Half`: it is taken and completes the pair.
if regional_state == RegionalState::Full {
take_curr = false;
break;
} else {
Regional(RegionalState::Full)
}
}
_ => {
take_curr = false;
break;
}
},
// NOTE(review): no assignment visible in this function sets `Emoji`, so
// this arm looks unreachable during backward iteration — confirm.
Emoji => {
if is_emoji(ch) {
Zwj
} else {
take_curr = false;
break;
}
},
FormatExtend(t) => match t {
RequireNumeric if cat == wd::WC_Numeric => Numeric,
RequireLetter if cat == wd::WC_ALetter => Letter,
RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter,
AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter,
RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter,
// Requirement not met: `state` stays FormatExtend, so the rollback below
// decides where the segment starts.
_ => break
}
}
}
// The scan stopped (or the input ran out) while a pending FormatExtend
// state was unresolved: the segment starts at the saved index and no
// category is cached.
if let FormatExtend(t) = state {
if t == RequireLetter || t == RequireHLetter ||
t == RequireNumeric || t == AcceptNone || t == AcceptQLetter {
previdx = saveidx;
take_cat = false;
take_curr = false;
}
}
// Either the segment starts at the current char (`idx`), or it starts at
// `previdx` and the current char is left for the next call, optionally
// with its category cached.
self.catb = if take_curr {
None
} else {
idx = previdx;
if take_cat {
Some(cat)
} else {
None
}
};
// Split at `idx`: the suffix is returned, the rest is kept.
let retstr = &self.string[idx..];
self.string = &self.string[..idx];
Some(retstr)
}
}
impl<'a> UWordBounds<'a> {
    /// Returns the part of the original string that has not been yielded yet.
    #[inline]
    pub fn as_str(&self) -> &'a str {
        let remaining: &'a str = self.string;
        remaining
    }

    /// Category of the char that follows the char starting at byte `idx`, or
    /// `None` when that char is the last one.
    ///
    /// Panics if there is no char at `idx`.
    #[inline]
    fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
        use crate::tables::word as wd;

        let mut rest = self.string[idx..].chars();
        // Step over the char at `idx`; it must exist.
        rest.next().unwrap();
        match rest.next() {
            Some(following) => Some(wd::word_category(following).2),
            None => None,
        }
    }

    /// Category of the char that ends right before byte `idx`, or `None`
    /// when `idx` is 0.
    #[inline]
    fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
        use crate::tables::word as wd;

        // An empty prefix (idx == 0) has no last char, which yields `None`.
        self.string[..idx]
            .chars()
            .next_back()
            .map(|preceding| wd::word_category(preceding).2)
    }
}
/// Creates a word-bound iterator over `s` with both category caches empty.
#[inline]
pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
    let (cat, catb) = (None, None);
    UWordBounds { string: s, cat, catb }
}
/// Creates an indexed word-bound iterator over `s`.
///
/// The address of `s` is recorded so that each yielded segment's offset can
/// later be computed by pointer subtraction.
#[inline]
pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
    let start_offset = s.as_ptr() as usize;
    let iter = new_word_bounds(s);
    UWordBoundIndices { start_offset, iter }
}
/// Returns true if at least one char of the segment is alphanumeric
/// according to the crate's `is_alphanumeric` table lookup.
///
/// Takes `&&str` so it can be used directly as a `Filter` predicate over
/// `&str` items.
#[inline]
fn has_alphanumeric(s: &&str) -> bool {
    use crate::tables::util::is_alphanumeric;

    for c in s.chars() {
        if is_alphanumeric(c) {
            return true;
        }
    }
    false
}
/// Creates an iterator over the word-bound segments of `s` that contain at
/// least one alphanumeric character.
#[inline]
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
    use super::UnicodeSegmentation;

    // The struct field stores the predicate as a function pointer, so name
    // it with that type before handing it to `filter`.
    let keep_segment: fn(&&str) -> bool = has_alphanumeric;
    let inner = s.split_word_bounds().filter(keep_segment);
    UnicodeWords { inner }
}
/// Creates an iterator over `(offset, segment)` pairs for the word-bound
/// segments of `s` that contain at least one alphanumeric character.
#[inline]
pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
    use super::UnicodeSegmentation;

    /// Applies `has_alphanumeric` to the segment half of an indexed item.
    fn segment_has_alphanumeric(item: &(usize, &str)) -> bool {
        has_alphanumeric(&item.1)
    }

    // The struct field stores the predicate as a function pointer, so name
    // it with that type before handing it to `filter`.
    let keep_item: fn(&(usize, &str)) -> bool = segment_has_alphanumeric;
    let inner = s.split_word_bound_indices().filter(keep_item);
    UnicodeWordIndices { inner }
}