Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ license = "MIT/Apache-2.0"
keywords = ["text", "unicode", "grapheme", "word", "boundary"]
readme = "README.md"
description = """
This crate provides Grapheme Cluster and Word boundaries
This crate provides Grapheme Cluster, Word and Sentence boundaries
according to Unicode Standard Annex #29 rules.
"""

Expand Down
7 changes: 7 additions & 0 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,3 +351,10 @@ def emit_break_module(f, break_table, break_cats, name):
word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
word_table.sort(key=lambda w: w[0])
emit_break_module(rf, word_table, word_cats.keys(), "word")

sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
sentence_table = []
for cat in sentence_cats:
sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])
sentence_table.sort(key=lambda w: w[0])
emit_break_module(rf, sentence_table, sentence_cats.keys(), "sentence")
15 changes: 15 additions & 0 deletions scripts/unicode_gen_breaktests.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,23 @@ def create_words_data(f):
f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)

def create_sentence_data(f):
d = load_test_data("auxiliary/SentenceBreakTest.txt")

test = []

for (c, i) in d:
allchars = [cn for s in c for cn in s]
test.append((allchars, c))

wtype = "&'static [(&'static str, &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/SentenceBreakTest.txt\n")
unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)

if __name__ == "__main__":
with open("testdata.rs", "w") as rf:
rf.write(unicode.preamble)
create_grapheme_data(rf)
create_words_data(rf)
create_sentence_data(rf)
40 changes: 39 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Iterators which split strings on Grapheme Cluster or Word boundaries, according
//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
//!
//! ```rust
Expand Down Expand Up @@ -67,10 +67,12 @@ pub use grapheme::{Graphemes, GraphemeIndices};
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
pub use tables::UNICODE_VERSION;
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};

mod grapheme;
mod tables;
mod word;
mod sentence;

#[cfg(test)]
mod test;
Expand Down Expand Up @@ -174,6 +176,27 @@ pub trait UnicodeSegmentation {
/// assert_eq!(&swi1[..], b);
/// ```
fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;

/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// The concatenation of the substrings returned by this function is just the original string.
fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;

/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// Here, "sentences" are just those substrings which, after splitting on
/// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;

/// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
/// and their offsets. See `split_sentence_bounds()` for more information.
fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
}

impl UnicodeSegmentation for str {
Expand Down Expand Up @@ -201,4 +224,19 @@ impl UnicodeSegmentation for str {
fn split_word_bound_indices(&self) -> UWordBoundIndices {
word::new_word_bound_indices(self)
}

#[inline]
fn unicode_sentences(&self) -> UnicodeSentences {
sentence::new_unicode_sentences(self)
}

#[inline]
fn split_sentence_bounds(&self) -> USentenceBounds {
sentence::new_sentence_bounds(self)
}

#[inline]
fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
sentence::new_sentence_bound_indices(self)
}
}
Loading