std.uni - Phobos documentation

References:: ASCII Table, Wikipedia, The Unicode Consortium, Unicode normalization forms, Unicode text segmentation, Unicode Implementation Guidelines, Unicode Conformance
Trademarks:: Unicode(tm) is a trademark of Unicode, Inc.
Source:: std/uni.d

**General category**
Abb.	Long form	Abb.	Long form	Abb.	Long form
L	Letter	Cn	Unassigned	Po	Other_Punctuation
Ll	Lowercase_Letter	Co	Private_Use	Ps	Open_Punctuation
Lm	Modifier_Letter	Cs	Surrogate	S	Symbol
Lo	Other_Letter	N	Number	Sc	Currency_Symbol
Lt	Titlecase_Letter	Nd	Decimal_Number	Sk	Modifier_Symbol
Lu	Uppercase_Letter	Nl	Letter_Number	Sm	Math_Symbol
M	Mark	No	Other_Number	So	Other_Symbol
Mc	Spacing_Mark	P	Punctuation	Z	Separator
Me	Enclosing_Mark	Pc	Connector_Punctuation	Zl	Line_Separator
Mn	Nonspacing_Mark	Pd	Dash_Punctuation	Zp	Paragraph_Separator
C	Other	Pe	Close_Punctuation	Zs	Space_Separator
Cc	Control	Pf	Final_Punctuation	-	Any
Cf	Format	Pi	Initial_Punctuation	-	ASCII

**Common binary properties**
Name	Name	Name
Alphabetic	Ideographic	Other_Uppercase
ASCII_Hex_Digit	IDS_Binary_Operator	Pattern_Syntax
Bidi_Control	ID_Start	Pattern_White_Space
Cased	IDS_Trinary_Operator	Quotation_Mark
Case_Ignorable	Join_Control	Radical
Dash	Logical_Order_Exception	Soft_Dotted
Default_Ignorable_Code_Point	Lowercase	STerm
Deprecated	Math	Terminal_Punctuation
Diacritic	Noncharacter_Code_Point	Unified_Ideograph
Extender	Other_Alphabetic	Uppercase
Grapheme_Base	Other_Default_Ignorable_Code_Point	Variation_Selector
Grapheme_Extend	Other_Grapheme_Extend	White_Space
Grapheme_Link	Other_ID_Continue	XID_Continue
Hex_Digit	Other_ID_Start	XID_Start
Hyphen	Other_Lowercase
ID_Continue	Other_Math

**Blocks**
Aegean Numbers	Ethiopic Extended	Mongolian
Alchemical Symbols	Ethiopic Extended-A	Musical Symbols
Alphabetic Presentation Forms	Ethiopic Supplement	Myanmar
Ancient Greek Musical Notation	General Punctuation	Myanmar Extended-A
Ancient Greek Numbers	Geometric Shapes	New Tai Lue
Ancient Symbols	Georgian	NKo
Arabic	Georgian Supplement	Number Forms
Arabic Extended-A	Glagolitic	Ogham
Arabic Mathematical Alphabetic Symbols	Gothic	Ol Chiki
Arabic Presentation Forms-A	Greek and Coptic	Old Italic
Arabic Presentation Forms-B	Greek Extended	Old Persian
Arabic Supplement	Gujarati	Old South Arabian
Armenian	Gurmukhi	Old Turkic
Arrows	Halfwidth and Fullwidth Forms	Optical Character Recognition
Avestan	Hangul Compatibility Jamo	Oriya
Balinese	Hangul Jamo	Osmanya
Bamum	Hangul Jamo Extended-A	Phags-pa
Bamum Supplement	Hangul Jamo Extended-B	Phaistos Disc
Basic Latin	Hangul Syllables	Phoenician
Batak	Hanunoo	Phonetic Extensions
Bengali	Hebrew	Phonetic Extensions Supplement
Block Elements	High Private Use Surrogates	Playing Cards
Bopomofo	High Surrogates	Private Use Area
Bopomofo Extended	Hiragana	Rejang
Box Drawing	Ideographic Description Characters	Rumi Numeral Symbols
Brahmi	Imperial Aramaic	Runic
Braille Patterns	Inscriptional Pahlavi	Samaritan
Buginese	Inscriptional Parthian	Saurashtra
Buhid	IPA Extensions	Sharada
Byzantine Musical Symbols	Javanese	Shavian
Carian	Kaithi	Sinhala
Chakma	Kana Supplement	Small Form Variants
Cham	Kanbun	Sora Sompeng
Cherokee	Kangxi Radicals	Spacing Modifier Letters
CJK Compatibility	Kannada	Specials
CJK Compatibility Forms	Katakana	Sundanese
CJK Compatibility Ideographs	Katakana Phonetic Extensions	Sundanese Supplement
CJK Compatibility Ideographs Supplement	Kayah Li	Superscripts and Subscripts
CJK Radicals Supplement	Kharoshthi	Supplemental Arrows-A
CJK Strokes	Khmer	Supplemental Arrows-B
CJK Symbols and Punctuation	Khmer Symbols	Supplemental Mathematical Operators
CJK Unified Ideographs	Lao	Supplemental Punctuation
CJK Unified Ideographs Extension A	Latin-1 Supplement	Supplementary Private Use Area-A
CJK Unified Ideographs Extension B	Latin Extended-A	Supplementary Private Use Area-B
CJK Unified Ideographs Extension C	Latin Extended Additional	Syloti Nagri
CJK Unified Ideographs Extension D	Latin Extended-B	Syriac
Combining Diacritical Marks	Latin Extended-C	Tagalog
Combining Diacritical Marks for Symbols	Latin Extended-D	Tagbanwa
Combining Diacritical Marks Supplement	Lepcha	Tags
Combining Half Marks	Letterlike Symbols	Tai Le
Common Indic Number Forms	Limbu	Tai Tham
Control Pictures	Linear B Ideograms	Tai Viet
Coptic	Linear B Syllabary	Tai Xuan Jing Symbols
Counting Rod Numerals	Lisu	Takri
Cuneiform	Low Surrogates	Tamil
Cuneiform Numbers and Punctuation	Lycian	Telugu
Currency Symbols	Lydian	Thaana
Cypriot Syllabary	Mahjong Tiles	Thai
Cyrillic	Malayalam	Tibetan
Cyrillic Extended-A	Mandaic	Tifinagh
Cyrillic Extended-B	Mathematical Alphanumeric Symbols	Transport And Map Symbols
Cyrillic Supplement	Mathematical Operators	Ugaritic
Deseret	Meetei Mayek	Unified Canadian Aboriginal Syllabics
Devanagari	Meetei Mayek Extensions	Unified Canadian Aboriginal Syllabics Extended
Devanagari Extended	Meroitic Cursive	Vai
Dingbats	Meroitic Hieroglyphs	Variation Selectors
Domino Tiles	Miao	Variation Selectors Supplement
Egyptian Hieroglyphs	Miscellaneous Mathematical Symbols-A	Vedic Extensions
Emoticons	Miscellaneous Mathematical Symbols-B	Vertical Forms
Enclosed Alphanumerics	Miscellaneous Symbols	Yijing Hexagram Symbols
Enclosed Alphanumeric Supplement	Miscellaneous Symbols and Arrows	Yi Radicals
Enclosed CJK Letters and Months	Miscellaneous Symbols And Pictographs	Yi Syllables
Enclosed Ideographic Supplement	Miscellaneous Technical
Ethiopic	Modifier Tone Letters

**Scripts**
Arabic	Hanunoo	Old_Italic
Armenian	Hebrew	Old_Persian
Avestan	Hiragana	Old_South_Arabian
Balinese	Imperial_Aramaic	Old_Turkic
Bamum	Inherited	Oriya
Batak	Inscriptional_Pahlavi	Osmanya
Bengali	Inscriptional_Parthian	Phags_Pa
Bopomofo	Javanese	Phoenician
Brahmi	Kaithi	Rejang
Braille	Kannada	Runic
Buginese	Katakana	Samaritan
Buhid	Kayah_Li	Saurashtra
Canadian_Aboriginal	Kharoshthi	Sharada
Carian	Khmer	Shavian
Chakma	Lao	Sinhala
Cham	Latin	Sora_Sompeng
Cherokee	Lepcha	Sundanese
Common	Limbu	Syloti_Nagri
Coptic	Linear_B	Syriac
Cuneiform	Lisu	Tagalog
Cypriot	Lycian	Tagbanwa
Cyrillic	Lydian	Tai_Le
Deseret	Malayalam	Tai_Tham
Devanagari	Mandaic	Tai_Viet
Egyptian_Hieroglyphs	Meetei_Mayek	Takri
Ethiopic	Meroitic_Cursive	Tamil
Georgian	Meroitic_Hieroglyphs	Telugu
Glagolitic	Miao	Thaana
Gothic	Mongolian	Thai
Greek	Myanmar	Tibetan
Gujarati	New_Tai_Lue	Tifinagh
Gurmukhi	Nko	Ugaritic
Han	Ogham	Vai
Hangul	Ol_Chiki	Yi

**Hangul syllable type**
Abb.	Long form
L	Leading_Jamo
LV	LV_Syllable
LVT	LVT_Syllable
T	Trailing_Jamo
V	Vowel_Jamo

dchar lineSep;

Constant code point (0x2028) - line separator.

dchar paraSep;

Constant code point (0x2029) - paragraph separator.

template isCodepointSet(T)

Tests if T is some kind of a set of code points. Intended for template constraints.

template isIntegralPair(T, V = uint)

Tests if T is a pair of integers that implicitly convert to V. The following code must compile for any pair T:

(T x){ V a = x[0]; V b = x[1];}

The following must not compile:

(T x){ V c = x[2];}

alias CodepointSet = InversionList!(GcPolicy).InversionList;

The recommended default type for a set of code points. For details, see the current implementation: InversionList.

struct CodepointInterval;

The recommended type of std.typecons.Tuple to represent [a, b) intervals of code points. As used in InversionList . Any interval type should pass isIntegralPair trait.

struct InversionList(SP = GcPolicy);

InversionList is a set of code points represented as an array of open-right [a, b) intervals (see CodepointInterval above). The name comes from the way the representation reads left to right. For instance a set of all values [10, 50), [80, 90), plus a singular value 60 looks like this:

10, 50, 60, 61, 80, 90

The way to read this is: start with negative meaning that all numbers smaller than the next one are not present in this set (and positive - the contrary). Then switch positive/negative after each number passed from left to right.

This way negative spans until 10, then positive until 50, then negative until 60, then positive until 61, and so on. As seen, this provides space-efficient storage of highly redundant data that comes in long runs - a description that Unicode character properties fit nicely. The technique itself could be seen as a variation on RLE encoding.

Sets are value types (just like int is) thus they are never aliased.

Example:

auto a = CodepointSet('a', 'z'+1);
auto b = CodepointSet('A', 'Z'+1);
auto c = a;
a = a | b;
assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1));
assert(a != c);

See also unicode for simpler construction of sets from predefined ones.

Memory usage is 6 bytes per each contiguous interval in a set. The value semantics are achieved by using the copy-on-write (COW) technique (see http://en.wikipedia.org/wiki/Copy-on-write) and thus it's not safe to cast this type to shared.

Note:

It's not recommended to rely on the template parameters or the exact type of a current code point set in std.uni. The type and parameters may change when the standard allocators design is finalized. Use isCodepointSet with templates or just stick with the default alias CodepointSet throughout the whole code base.

this(Set)(Set set) if (isCodepointSet!Set);

Construct from another code point set of any type.

this(Range)(Range intervals) if (isForwardRange!Range && isIntegralPair!(ElementType!Range));

Construct a set from a range of sorted code point intervals.

this()(uint[] intervals...);

Construct a set from plain values of sorted code point intervals.

Example:

auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
foreach(v; 'a'..'z'+1)
    assert(set[v]);
// Cyrillic lowercase interval

foreach(v; 'а'..'я'+1)
    assert(set[v]);

@property auto byInterval();

Get range that spans all of the code point intervals in this InversionList .

Example:

import std.algorithm, std.typecons;
auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);
set.byInterval.equal([tuple('A', 'E'), tuple('a', 'e')]);

const bool opIndex(uint val);

Tests the presence of code point val in this set.

Example:

auto gothic = unicode.Gothic;
// Gothic letter ahsa

assert(gothic['\U00010330']);
// no ascii in Gothic obviously

assert(!gothic['$']);

size_t length();

Number of code points in this set

This opBinary(string op, U)(U rhs) if (isCodepointSet!U || is(U : dchar));

Sets support natural syntax for set algebra, namely:


Operator	Math notation	Description
&	a ∩ b	intersection
\|	a ∪ b	union
-	a ∖ b	subtraction
~	a ~ b	symmetric set difference i.e. (a ∪ b) \ (a ∩ b)

Example:

auto lower = unicode.LowerCase;
auto upper = unicode.UpperCase;
auto ascii = unicode.ASCII;

assert((lower & upper).empty); // no intersection

auto lowerASCII = lower & ascii;
assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
// throw away all of the lowercase ASCII

assert((ascii - lower).length == 128 - 26);

auto onlyOneOf = lower ~ ascii;
assert(!onlyOneOf['Δ']); // not ASCII and not lowercase

assert(onlyOneOf['$']); // ASCII and not lowercase

assert(!onlyOneOf['a']); // ASCII and lowercase

assert(onlyOneOf['я']); // not ASCII but lowercase


// throw away all cased letters from ASCII

auto noLetters = ascii - (lower | upper);
assert(noLetters.length == 128 - 26*2);

This opOpAssign(string op, U)(U rhs) if (isCodepointSet!U || is(U : dchar));

The 'op=' versions of the above overloaded operators.

bool opBinaryRight(string op : "in", U)(U ch) if (is(U : dchar));

Tests the presence of codepoint ch in this set, the same as opIndex .

auto opUnary(string op : "!")();

Obtains a set that is the inversion of this set. See also inverted .

@property auto byCodepoint();

A range that spans each code point in this set.

Example:

import std.algorithm;
auto set = unicode.ASCII;
set.byCodepoint.equal(iota(0, 0x80));

void toString(scope void delegate(const(char)[]) sink);

Obtain textual representation of this set in the form of open-right intervals and feed it to sink.

Used by various standard formatting facilities such as std.format.formattedWrite, std.stdio.write, std.stdio.writef, std.conv.to and others.

Example:

import std.conv;
assert(unicode.ASCII.to!string == "[0..128)");

ref auto add()(uint a, uint b);

Add an interval [a, b) to this set.

Example:

CodepointSet someSet;
someSet.add('0', '5').add('A','Z'+1);
someSet.add('5', '9'+1);
assert(someSet['0']);
assert(someSet['5']);
assert(someSet['9']);
assert(someSet['Z']);

@property auto inverted();

Obtains a set that is the inversion of this set.

See the '!' opUnary for the same but using operators.

Example:

set = unicode.ASCII;
// union with the inverse gets all of the code points in the Unicode

assert((set | set.inverted).length == 0x110000);
// no intersection with the inverse

assert((set & set.inverted).empty);

string toSourceCode(string funcName = "");

Generates string with D source code of unary function with name of funcName taking a single dchar argument. If funcName is empty the code is adjusted to be a lambda function.

The function generated tests if the code point passed belongs to this set or not. The result is to be used with string mixin. The intended usage area is aggressive optimization via meta programming in parser generators and the like.

Note:

Use with care for relatively small or regular sets. It could end up being slower than just using multi-staged tables.

Example:

import std.stdio;

// construct set directly from [a, b) intervals

auto set = CodepointSet(10, 12, 45, 65, 100, 200);
writeln(set);
writeln(set.toSourceCode("func"));

The above outputs something along the lines of:

bool func(dchar ch)
{
    if(ch < 45)
    {
        if(ch == 10 || ch == 11) return true;
        return false;
    }
    else if (ch < 65) return true;
    else
    {
        if(ch < 100) return false;
        if(ch < 200) return true;
        return false;
    }
}

const bool empty();

True if this set doesn't contain any code points.

Example:

CodepointSet emptySet;
assert(emptySet.length == 0);
assert(emptySet.empty);

template codepointSetTrie(sizes...) if (sumOfIntegerTuple!sizes == 21)

A shorthand for creating a custom multi-level fixed Trie from a CodepointSet. sizes are numbers of bits per level, with the most significant bits used first.

Note:

The sum of sizes must be equal to 21.

template CodepointSetTrie(sizes...) if (sumOfIntegerTuple!sizes == 21)

Type of Trie generated by codepointSetTrie function.

template codepointTrie(T, sizes...) if (sumOfIntegerTuple!sizes == 21)

A slightly more general tool for building fixed Trie for the Unicode data.

Specifically, unlike codepointSetTrie, it allows creating mappings of dchar to an arbitrary type T.

Note:

Overload taking CodepointSets will naturally convert only to bool mapping Tries.

Example:

// pick characters from the Greek script

auto set = unicode.Greek;

// a user-defined property (or an expensive function)

// that we want to look up

static uint luckFactor(dchar ch)
{
    // here we consider a character lucky

    // if its code point has a lot of identical hex-digits

    // e.g. arabic letter DDAL (\u0688) has a "luck factor" of 2

    ubyte[6] nibbles; // 6 4-bit chunks of code point

    uint value = ch;
    foreach(i; 0..6)
    {
        nibbles[i] = value & 0xF;
        value >>= 4;
    }
    uint luck;
    foreach(n; nibbles)
        luck = cast(uint)max(luck, count(nibbles[], n));
    return luck;
}

// only unsigned built-ins are supported at the moment

alias LuckFactor = BitPacked!(uint, 3);

// create a temporary associative array (AA)

LuckFactor[dchar] map;
foreach(ch; set.byCodepoint)
    map[ch] = luckFactor(ch);

// bits per stage are chosen randomly, feel free to optimize

auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map);

// from now on the AA is not needed

foreach(ch; set.byCodepoint)
    assert(trie[ch] == luckFactor(ch)); // verify

// CJK is not Greek, thus it has the default value

assert(trie['\u4444'] == 0);
// and here is a couple of quite lucky Greek characters:

// Greek small letter epsilon with dasia

assert(trie['\u1F11'] == 3);
// Ancient Greek metretes sign

assert(trie['\U00010181'] == 3);

template CodepointTrie(T, sizes...) if (sumOfIntegerTuple!sizes == 21)

Type of Trie as generated by codepointTrie function.

auto toTrie(size_t level, Set)(Set set) if (isCodepointSet!Set);

Note:: Level 4 stays very practical (being faster and more predictable) compared to using direct lookup on the set itself.

auto toDelegate(Set)(Set set) if (isCodepointSet!Set);

Builds a Trie with typically optimal speed-size trade-off and wraps it into a delegate of the following type: bool delegate(dchar ch).

Effectively this creates a 'tester' lambda suitable for algorithms like std.algorithm.find that take unary predicates.

See the Synopsis section for example.

struct unicode;

A single entry point to lookup Unicode code point sets by name or alias of a block, script or general category.

It uses well defined standard rules of property name lookup. This includes fuzzy matching of names, so that 'White_Space', 'white-SpAce' and 'whitespace' are all considered equal and yield the same set of white space characters.

static @property auto opDispatch(string name)();

Performs the lookup of set of code points with compile-time correctness checking. This short-cut version combines 3 searches: across blocks, scripts, and common binary properties.

Note that since scripts and blocks overlap the usual trick to disambiguate is used - to get a block use unicode.InBlockName, to search a script use unicode.ScriptName.

See also block , script and (not included in this search) hangulSyllableType .

Example:

auto ascii = unicode.ASCII;
assert(ascii['A']);
assert(ascii['~']);
assert(!ascii['\u00e0']);
// matching is case-insensitive

assert(ascii == unicode.ascII);
assert(!ascii['à']);
// underscores, '-' and whitespace in names are ignored too

auto latin = unicode.in_latin1_Supplement;
assert(latin['à']);
assert(!latin['$']);
// BTW Latin 1 Supplement is a block, hence "In" prefix

assert(latin == unicode("In Latin 1 Supplement"));
import std.exception;
// run-time look up throws if no such set is found

assert(collectException(unicode("InCyrilliac")));

static auto opCall(C)(in C[] name) if (is(C : dchar));

The same lookup across blocks, scripts, or binary properties, but performed at run-time. This version is provided for cases where name is not known beforehand; otherwise compile-time checked opDispatch is typically a better choice.

See the table of properties for available sets.

struct block;

Narrows down the search for sets of code points to all Unicode blocks.

struct script;

Narrows down the search for sets of code points to all Unicode scripts.

See the table of properties for available sets.

Example:

auto arabicScript = unicode.script.arabic;
auto arabicBlock = unicode.block.arabic;
// there is an intersection between script and block

assert(arabicBlock['؁']);
assert(arabicScript['؁']);
// but they are different

assert(arabicBlock != arabicScript);
assert(arabicBlock == unicode.inArabic);
assert(arabicScript == unicode.arabic);

struct hangulSyllableType;

Fetch a set of code points that have the given hangul syllable type.

Other non-binary properties (once supported) follow the same notation - unicode.propertyName.propertyValue for compile-time checked access and unicode.propertyName(propertyValue) for run-time checked one.

See the table of properties for available sets.

Example:

// L here is syllable type not Letter as in unicode.L short-cut

auto leadingVowel = unicode.hangulSyllableType("L");
// check that some leading vowels are present

foreach(vowel; '\u1110'..'\u115F')
    assert(leadingVowel[vowel]);
assert(leadingVowel == unicode.hangulSyllableType.L);

size_t graphemeStride(C)(in C[] input, size_t index) if (is(C : dchar));

Returns the length of grapheme cluster starting at index. Both the resulting length and the index are measured in code units.

Example:

// ASCII as usual is 1 code unit, 1 code point etc.

assert(graphemeStride("  ", 1) == 1);
// A + combining ring above

string city = "A\u030Arhus";
size_t first = graphemeStride(city, 0);
assert(first == 3); //\u030A has 2 UTF-8 code units

assert(city[0..first] == "A\u030A");
assert(city[first..$] == "rhus");

Grapheme decodeGrapheme(Input)(ref Input inp) if (isInputRange!Input && is(Unqual!(ElementType!Input) == dchar));

Note:: This function modifies inp and thus inp must be an L-value.

struct Grapheme;

A structure designed to effectively pack characters of a grapheme cluster.

Grapheme has value semantics so 2 copies of a Grapheme always refer to distinct objects. In most actual scenarios a Grapheme fits on the stack and avoids memory allocation overhead for all but quite long clusters.

Example:

import std.algorithm;
string bold = "ku\u0308hn";

// note that decodeGrapheme takes parameter by ref

// slicing a grapheme yields a range of dchar

assert(decodeGrapheme(bold)[].equal("k"));

// the next grapheme is 2 characters long

auto wideOne = decodeGrapheme(bold);
assert(wideOne.length == 2);
assert(wideOne[].equal("u\u0308"));

// the usual range manipulation is possible

assert(wideOne[].filter!isMark.equal("\u0308"));

See also decodeGrapheme , graphemeStride .

const pure nothrow @trusted dchar opIndex(size_t index);

Gets a code point at the given index in this cluster.

pure nothrow @trusted void opIndexAssign(dchar ch, size_t index);

Writes a code point ch at given index in this cluster.

Warning:

Use of this facility may invalidate grapheme cluster, see also Grapheme.valid .

Example:

auto g = Grapheme("A\u0302");
assert(g[0] == 'A');
assert(g.valid);
g[1] = '~'; // ASCII tilde is not a combining mark

assert(g[1] == '~');
assert(!g.valid);

pure nothrow @trusted auto opSlice(size_t a, size_t b);
pure nothrow @trusted auto opSlice();

Warning:: Invalidates when this Grapheme leaves the scope, attempts to use it then would lead to memory corruption.

const pure nothrow @property @trusted size_t length();

Grapheme cluster length in code points.

ref auto opOpAssign(string op)(dchar ch);

Append character ch to this grapheme.

Warning:

Use of this facility may invalidate grapheme cluster, see also valid.

Example:

auto g = Grapheme("A");
assert(g.valid);
g ~= '\u0301';
assert(g[].equal("A\u0301"));
assert(g.valid);
g ~= "B";
// not a valid grapheme cluster anymore

assert(!g.valid);
// still could be useful though

assert(g[].equal("A\u0301B"));

ref auto opOpAssign(string op, Input)(Input inp) if (isInputRange!Input && is(ElementType!Input : dchar));

Append all characters from the input range inp to this Grapheme.

bool valid()();

True if this object contains valid extended grapheme cluster. Decoding primitives of this module always return a valid Grapheme.

Appending to and direct manipulation of grapheme's characters may render it no longer valid. Certain applications may choose to use Grapheme as a "small string" of any code points and ignore this property entirely.

int sicmp(S1, S2)(S1 str1, S2 str2) if (isForwardRange!S1 && is(Unqual!(ElementType!S1) == dchar) && isForwardRange!S2 && is(Unqual!(ElementType!S2) == dchar));

Does basic case-insensitive comparison of strings str1 and str2. This function uses simpler comparison rule thus achieving better performance than icmp. However keep in mind the warning below.

Warning:

This function only handles 1:1 code point mapping and thus is not sufficient for certain alphabets like German, Greek and a few others.

Example:

assert(sicmp("Август", "авгусТ") == 0);
// Greek also works as long as there is no 1:M mapping in sight

assert(sicmp("ΌΎ", "όύ") == 0);
// things like the following won't get matched as equal

// Greek small letter iota with dialytika and tonos

assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);

// while icmp has no problem with that

assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0);
assert(icmp("ΌΎ", "όύ") == 0);

int icmp(S1, S2)(S1 str1, S2 str2) if (isForwardRange!S1 && is(Unqual!(ElementType!S1) == dchar) && isForwardRange!S2 && is(Unqual!(ElementType!S2) == dchar));

Does case insensitive comparison of str1 and str2. Follows the rules of full case-folding mapping. This includes matching as equal German ß with "ss" and other 1:M code point mappings unlike sicmp. The cost of icmp being pedantically correct is slightly worse performance.

Example:

assert(icmp("Rußland", "Russland") == 0);
assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);

@trusted ubyte combiningClass(dchar ch);

Returns the combining class of ch.

Example:

// shorten the code

alias CC = combiningClass;

// combining tilde

assert(CC('\u0303') == 230);
// combining ring below

assert(CC('\u0325') == 220);
// the simple consequence is that "tilde" should be

// placed after a "ring below" in a sequence

enum UnicodeDecomposition: int;

Unicode character decomposition type.

Canonical

Canonical decomposition. The result is canonically equivalent sequence.

Compatibility

Note:: Compatibility decomposition is a lossy conversion, typically suitable only for fuzzy matching and internal processing.

@trusted dchar compose(dchar first, dchar second);

Try to canonically compose 2 characters. Returns the composed character if they do compose and dchar.init otherwise.

The assumption is that first comes before second in the original text, usually meaning that the first is a starter.

Note:

Hangul syllables are not covered by this function. See composeJamo below.

Example:

assert(compose('A','\u0308') == '\u00C4');
assert(compose('A', 'B') == dchar.init);
assert(compose('C', '\u0301') == '\u0106');
// note that the starter is the first one

// thus the following doesn't compose

assert(compose('\u0308', 'A') == dchar.init);

Grapheme decompose(UnicodeDecomposition decompType = Canonical)(dchar ch);

Returns a full Canonical (by default) or Compatibility decomposition of character ch. If no decomposition is available returns a Grapheme with the ch itself.

Note:

This function also decomposes hangul syllables as prescribed by the standard. See also decomposeHangul for a restricted version that takes into account only hangul syllables but no other decompositions.

Example:

import std.algorithm;
assert(decompose('Ĉ')[].equal("C\u0302"));
assert(decompose('D')[].equal("D"));
assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7"));
assert(decompose!Compatibility('¹').equal("1"));

@trusted Grapheme decomposeHangul(dchar ch);

Decomposes a Hangul syllable. If ch is not a composed syllable then this function returns Grapheme containing only ch as is.

Example:

import std.algorithm;
assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));

@trusted dchar composeJamo(dchar lead, dchar vowel, dchar trailing = (dchar).init);

Try to compose hangul syllable out of a leading consonant (lead), a vowel and optional trailing consonant jamos.

On success returns the composed LV or LVT hangul syllable.

If any of lead and vowel are not a valid hangul jamo of the respective character class returns dchar.init.

Example:

assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
// leaving out T-vowel, or passing any codepoint

// that is not trailing consonant composes an LV-syllable

assert(composeJamo('\u1111', '\u1171') == '\uD4CC');
assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
assert(composeJamo('\u1111', 'A') == dchar.init);
assert(composeJamo('A', '\u1171') == dchar.init);

enum NormalizationForm: int;

Enumeration type for normalization forms, passed as template parameter for functions like normalize .

NFC
NFD
NFKC
NFKD

Shorthand aliases from values indicating normalization forms.

inout(C)[] normalize(NormalizationForm norm = NFC, C)(inout(C)[] input);

Returns input string normalized to the chosen form. Form C is used by default.

For more information on normalization forms see the normalization section.

Note:

In cases where the string in question is already normalized, it is returned unmodified and no memory allocation happens.

Example:

// any encoding works

wstring greet = "Hello world";
assert(normalize(greet) is greet); // the same exact slice


// An example of a character with all 4 forms being different:

// Greek upsilon with acute and hook symbol (code point 0x03D3)

assert(normalize!NFC("ϓ") == "\u03D3");
assert(normalize!NFD("ϓ") == "\u03D2\u0301");
assert(normalize!NFKC("ϓ") == "\u038E");
assert(normalize!NFKD("ϓ") == "\u03A5\u0301");

bool allowedIn(NormalizationForm norm)(dchar ch);

Tests if dchar ch is always allowed (Quick_Check=YES) in normalization form norm.

// e.g. Cyrillic is always allowed, so is ASCII

assert(allowedIn!NFC('я'));
assert(allowedIn!NFD('я'));
assert(allowedIn!NFKC('я'));
assert(allowedIn!NFKD('я'));
assert(allowedIn!NFC('Z'));

pure nothrow @safe bool isWhite(dchar c);

Whether or not c is a Unicode whitespace character. (general Unicode category: Part of C0(tab, vertical tab, form feed, carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085))

pure nothrow @safe bool isLower(dchar c);

Return whether c is a Unicode lowercase character.

pure nothrow @safe bool isUpper(dchar c);

Return whether c is a Unicode uppercase character.

pure nothrow @safe dchar toLower(dchar c);

Warning:: certain alphabets like German and Greek have no 1:1 upper-lower mapping. Use overload of toLower which takes full string instead.

pure @trusted void toLowerInPlace(C)(ref C[] s) if (is(C == char) || is(C == wchar) || is(C == dchar));

Converts s to lowercase (by performing Unicode lowercase mapping) in place. For a few characters string length may increase after the transformation, in such a case the function reallocates exactly once. If s does not have any uppercase characters, then s is unaltered.

pure @trusted void toUpperInPlace(C)(ref C[] s) if (is(C == char) || is(C == wchar) || is(C == dchar));

Converts s to uppercase (by performing Unicode uppercase mapping) in place. For a few characters string length may increase after the transformation, in such a case the function reallocates exactly once. If s does not have any lowercase characters, then s is unaltered.

pure @trusted S toLower(S)(S s) if (isSomeString!S);

Returns a string which is identical to s except that all of its characters are converted to lowercase (by performing Unicode lowercase mapping). If none of s characters were affected, then s itself is returned.

pure nothrow @safe dchar toUpper(dchar c);

Warning:: Certain alphabets like German and Greek have no 1:1 upper-lower mapping. Use overload of toUpper which takes full string instead.

pure @trusted S toUpper(S)(S s) if (isSomeString!S);

Returns a string which is identical to s except that all of its characters are converted to uppercase (by performing Unicode uppercase mapping). If none of s characters were affected, then s itself is returned.

pure nothrow @safe bool isAlpha(dchar c);

Returns whether c is a Unicode alphabetic character (general Unicode category: Alphabetic).

pure nothrow @safe bool isMark(dchar c);

Returns whether c is a Unicode mark (general Unicode category: Mn, Me, Mc).

pure nothrow @safe bool isNumber(dchar c);

Returns whether c is a Unicode numerical character (general Unicode category: Nd, Nl, No).

pure nothrow @safe bool isPunctuation(dchar c);

Returns whether c is a Unicode punctuation character (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).

pure nothrow @safe bool isSymbol(dchar c);

Returns whether c is a Unicode symbol character (general Unicode category: Sm, Sc, Sk, So).

pure nothrow @safe bool isSpace(dchar c);

Note:: This doesn't include '\n', '\r', '\t' and other non-space characters. For commonly used less strict semantics see isWhite.

pure nothrow @safe bool isGraphical(dchar c);

Returns whether c is a Unicode graphical character (general Unicode category: L, M, N, P, S, Zs).

pure nothrow @safe bool isControl(dchar c);

Returns whether c is a Unicode control character (general Unicode category: Cc).

pure nothrow @safe bool isFormat(dchar c);

Returns whether c is a Unicode formatting character (general Unicode category: Cf).

pure nothrow @safe bool isPrivateUse(dchar c);

Returns whether c is a Unicode Private Use code point (general Unicode category: Co).

pure nothrow @safe bool isSurrogate(dchar c);

Returns whether c is a Unicode surrogate code point (general Unicode category: Cs).

pure nothrow @safe bool isSurrogateHi(dchar c);

Returns whether c is a Unicode high surrogate (lead surrogate).

pure nothrow @safe bool isSurrogateLo(dchar c);

Returns whether c is a Unicode low surrogate (trail surrogate).

pure nothrow @safe bool isNonCharacter(dchar c);

Returns whether c is a Unicode non-character i.e. a code point with no assigned abstract character. (general Unicode category: Cn)

License, Boost License 1.0.

dchar lineSep;

dchar paraSep;

template isCodepointSet(T)

template isIntegralPair(T, V = uint)

alias CodepointSet = InversionList!(GcPolicy).InversionList;

struct CodepointInterval;

struct InversionList(SP = GcPolicy);

this(Set)(Set set) if (isCodepointSet!Set);

this(Range)(Range intervals) if (isForwardRange!Range && isIntegralPair!(ElementType!Range));

this()(uint[] intervals...);

@property auto byInterval();

const bool opIndex(uint val);

size_t length();

This opBinary(string op, U)(U rhs) if (isCodepointSet!U || is(U : dchar));

This opOpAssign(string op, U)(U rhs) if (isCodepointSet!U || is(U : dchar));

bool opBinaryRight(string op : "in", U)(U ch) if (is(U : dchar));

auto opUnary(string op : "!")();

@property auto byCodepoint();

void toString(scope void delegate(const(char)[]) sink);

ref auto add()(uint a, uint b);

@property auto inverted();

string toSourceCode(string funcName = "");

const bool empty();

template codepointSetTrie(sizes...) if (sumOfIntegerTuple!sizes == 21)

See Also:: toTrie, which is even simpler.

template CodepointSetTrie(sizes...) if (sumOfIntegerTuple!sizes == 21)

template codepointTrie(T, sizes...) if (sumOfIntegerTuple!sizes == 21)

template CodepointTrie(T, sizes...) if (sumOfIntegerTuple!sizes == 21)

auto toTrie(size_t level, Set)(Set set) if (isCodepointSet!Set);

auto toDelegate(Set)(Set set) if (isCodepointSet!Set);

struct unicode;

static @property auto opDispatch(string name)();

static auto opCall(C)(in C[] name) if (is(C : dchar));

struct block;

struct script;

struct hangulSyllableType;

size_t graphemeStride(C)(in C[] input, size_t index) if (is(C : dchar));

Grapheme decodeGrapheme(Input)(ref Input inp) if (isInputRange!Input && is(Unqual!(ElementType!Input) == dchar));

struct Grapheme;

const pure nothrow @trusted dchar opIndex(size_t index);

pure nothrow @trusted void opIndexAssign(dchar ch, size_t index);

pure nothrow @trusted auto opSlice(size_t a, size_t b); pure nothrow @trusted auto opSlice();

const pure nothrow @property @trusted size_t length();

ref auto opOpAssign(string op)(dchar ch);

ref auto opOpAssign(string op, Input)(Input inp) if (isInputRange!Input && is(ElementType!Input : dchar));

bool valid()();

int sicmp(S1, S2)(S1 str1, S2 str2) if (isForwardRange!S1 && is(Unqual!(ElementType!S1) == dchar) && isForwardRange!S2 && is(Unqual!(ElementType!S2) == dchar));

int icmp(S1, S2)(S1 str1, S2 str2) if (isForwardRange!S1 && is(Unqual!(ElementType!S1) == dchar) && isForwardRange!S2 && is(Unqual!(ElementType!S2) == dchar));

@trusted ubyte combiningClass(dchar ch);

enum UnicodeDecomposition: int;

@trusted dchar compose(dchar first, dchar second);

Grapheme decompose(UnicodeDecomposition decompType = Canonical)(dchar ch);

@trusted Grapheme decomposeHangul(dchar ch);

@trusted dchar composeJamo(dchar lead, dchar vowel, dchar trailing = (dchar).init);

enum NormalizationForm: int;

inout(C)[] normalize(NormalizationForm norm = NFC, C)(inout(C)[] input);

bool allowedIn(NormalizationForm norm)(dchar ch);

pure nothrow @safe bool isWhite(dchar c);

pure nothrow @safe bool isLower(dchar c);

pure nothrow @safe bool isUpper(dchar c);

pure nothrow @safe dchar toLower(dchar c);

pure @trusted void toLowerInPlace(C)(ref C[] s) if (is(C == char) || is(C == wchar) || is(C == dchar));

pure @trusted void toUpperInPlace(C)(ref C[] s) if (is(C == char) || is(C == wchar) || is(C == dchar));

pure @trusted S toLower(S)(S s) if (isSomeString!S);

pure nothrow @safe dchar toUpper(dchar c);

pure @trusted S toUpper(S)(S s) if (isSomeString!S);

pure nothrow @safe bool isAlpha(dchar c);

pure nothrow @safe bool isMark(dchar c);

pure nothrow @safe bool isNumber(dchar c);

pure nothrow @safe bool isPunctuation(dchar c);

pure nothrow @safe bool isSymbol(dchar c);

pure nothrow @safe bool isSpace(dchar c);

pure nothrow @safe bool isGraphical(dchar c);

pure nothrow @safe bool isControl(dchar c);

pure nothrow @safe bool isFormat(dchar c);

pure nothrow @safe bool isPrivateUse(dchar c);

pure nothrow @safe bool isSurrogate(dchar c);

pure nothrow @safe bool isSurrogateHi(dchar c);

pure nothrow @safe bool isSurrogateLo(dchar c);

pure nothrow @trusted auto opSlice(size_t a, size_t b);
pure nothrow @trusted auto opSlice();