import * as React from 'react'
  /* @jsx mdx */
import { mdx } from '@mdx-js/react';
/* @jsxRuntime classic */

/* @jsx mdx */

/**
 * Front matter for this post (title, date, routing path, tags, ...),
 * exported so it is available alongside the default content component.
 */
export const _frontmatter = {
  title: "Unicode & emoji 🚀",
  date: "2021-01-20T13:00:00.000Z",
  layout: "post",
  draft: false,
  path: "/posts/unicode-emoji/",
  category: "Software",
  tags: [
    "emoji",
    "unicode",
    "python",
  ],
  description: "An example-rich introduction to Unicode and how emoji are represented using codepoints.",
};

// Props that MDXContent spreads onto the layout element.
const layoutProps = { _frontmatter };

// Element type used for the outermost layout element.
const MDXLayout = "wrapper";
export default function MDXContent({
  components,
  ...props
}) {
  return <MDXLayout {...layoutProps} {...props} components={components} mdxType="MDXLayout">
    <h2>{`Unicode Codepoints`}</h2>
    <p><em parentName="p">{`Unicode codepoints`}</em>{` are used to represent characters. A codepoint is just a number.
Every displayable character is represented by a sequence of one `}<strong parentName="p">{`or more`}</strong>{` codepoints.`}</p>
    <p>{`However, not every codepoint corresponds to a character. Some codepoints are non-printable, and instead function as "modifiers" (for example, joining characters together or
switching to "right-to-left" text mode).`}</p>
    <p>{`The characters we use in English are generally represented using a single codepoint. For example, the codepoint for `}<inlineCode parentName="p">{`"a"`}</inlineCode>{` is `}<inlineCode parentName="p">{`97`}</inlineCode>{` (base 10).`}</p>
    <p>{`Typically, we represent the codepoint in hexadecimal (base 16) instead of base 10. So, `}<inlineCode parentName="p">{`"a"`}</inlineCode>{` is represented by the codepoint `}<inlineCode parentName="p">{`0x61`}</inlineCode>{`. Sometimes you'll see them written like `}<inlineCode parentName="p">{`U+0061`}</inlineCode>{`.`}</p>
    <p>{`When writing code in Python or JavaScript we can write the codepoint like `}<inlineCode parentName="p">{`"\\u0061"`}</inlineCode>{`. `}<inlineCode parentName="p">{`"\\u0061"`}</inlineCode>{` represents a single character, and can be
used in a string and will behave just like the character `}<inlineCode parentName="p">{`"a"`}</inlineCode>{`.`}</p>
    <pre><code parentName="pre" {...{
        "className": "language-python"
      }}>{`>>> "a" == "\\u0061"
True

>>> "bab" == "b\\u0061b"
True
`}</code></pre>
    <p>{`In Python, for codepoints greater than `}<inlineCode parentName="p">{`0xFFFF`}</inlineCode>{` we need to use a capital `}<inlineCode parentName="p">{`U`}</inlineCode>{` and pad the width of the literal to 8 characters. For example, we'd write the codepoint `}<inlineCode parentName="p">{`0x10a00`}</inlineCode>{` as `}<inlineCode parentName="p">{`"\\U00010a00"`}</inlineCode>{`.`}</p>
    <h2>{`Converting between codepoints and characters`}</h2>
    <p>{`In Python, we can convert a character to a codepoint using the built-in `}<inlineCode parentName="p">{`ord`}</inlineCode>{` function.`}</p>
    <pre><code parentName="pre" {...{
        "className": "language-python"
      }}>{`>>> ord("a")
97
`}</code></pre>
    <p>{`Conversely, given a codepoint, we can find out which character it represents using the built-in `}<inlineCode parentName="p">{`chr`}</inlineCode>{` function.`}</p>
    <pre><code parentName="pre" {...{
        "className": "language-python"
      }}>{`>>> chr(97)
'a'
>>> chr(0x61)
'a'
`}</code></pre>
    <p>{`That is, `}<inlineCode parentName="p">{`chr`}</inlineCode>{` and `}<inlineCode parentName="p">{`ord`}</inlineCode>{` are the inverse of each other.`}</p>
    <p>{`However, this approach isn't recommended. For one, it implies that characters correspond to a single codepoint. In reality, characters are often represented by multiple codepoints.
This is particularly true for emoji and for East Asian languages such as Chinese, Japanese, and Korean (in i18n terms these languages are often referred to as "CJK").`}</p>
    <p>{`Look what happens when we ask for the codepoint corresponding to "`}<strong parentName="p">{`é`}</strong>{`":`}</p>
    <pre><code parentName="pre" {...{
        "className": "language-python"
      }}>{`>>> ord("é")
Traceback (most recent call last)
TypeError: ord() expected a character, but string of length 2 found
`}</code></pre>
    <p>{`As hinted at in the exception message, the character "é" actually consists of `}<strong parentName="p">{`two`}</strong>{` codepoints, so `}<inlineCode parentName="p">{`ord`}</inlineCode>{` raises a `}<inlineCode parentName="p">{`TypeError`}</inlineCode>{`.
Let's look at what's happening:`}</p>
    <pre><code parentName="pre" {...{
        "className": "language-python"
      }}>{`>>> len("é")  # Looks like 'é' is indeed 2 codepoints
2

>>> for codepoint in "é":  # Lets look at the 2 codepoints it consists of
...     print(codepoint)

e
 ́
>>> for codepoint in "é":  # Look up the two codepoints
...     print(ord(codepoint))

101
769
`}</code></pre>
    <p>{`The `}<inlineCode parentName="p">{`len`}</inlineCode>{` function returns the number of codepoints the string contains (`}<em parentName="p">{`not`}</em>{` the number of bytes or the number of glyphs that appear on screen when printed).`}</p>
    <p>{`If we iterate through a string in Python, we're actually iterating over the codepoints that make up the string.`}</p>
    <p>{`From the example above, we can see that the character `}<strong parentName="p">{`"é"`}</strong>{` consists of two codepoints: `}<strong parentName="p">{`"e"`}</strong>{` (101), and `}<strong parentName="p">{`" ́"`}</strong>{` (769).`}</p>
    <h2>{`Canonical equivalence`}</h2>
    <p>{`Visually identical characters can even be represented as different sequences of codepoints.`}</p>
    <p>{`For example, consider the character "`}<strong parentName="p">{`ü`}</strong>{`". This character can be represented in two different ways:`}</p>
    <pre><code parentName="pre" {...{
        "className": "language-python"
      }}>{`>>> print("\\u00FC")  # As a single codepoint
ü

>>> print("\\u0075\\u0308")  # Multiple codepoints
ü

>>> "ü" == "ü"  # Visually identical, but the codepoints differ
False

>>> "\\u00FC" == "\\u0075\\u0308"  # Exact same check as above
False

>>> len("ü")  # Single codepoint version
1

>>> len("ü")  # Two codepoint version
2
`}</code></pre>
    <p>{`This probably isn't the behaviour we'd expect. In reality, we'd  want to treat "ü" and "ü" as being the same character.`}</p>
    <p>{`We can get around this using `}<em parentName="p">{`normalisation`}</em>{`. The Python standard library comes to the rescue here with the `}<inlineCode parentName="p">{`unicodedata`}</inlineCode>{` module.`}</p>
    <p>{`By normalising two strings into a canonical representation, we can check their equivalence as expected.`}</p>
    <pre><code parentName="pre" {...{
        "className": "language-python"
      }}>{`>>> import unicodedata
>>> normalised = unicodedata.normalize("NFC", "\\u0075\\u0308")
>>> other_normalised = unicodedata.normalize("NFC", "\\u00FC")
>>> normalised == other_normalised
True
`}</code></pre>
    <p>{`You'll want to normalise Unicode strings at the boundary of your system (as early as possible!) to ensure that you're always dealing with the canonical representation.`}</p>
    <h2>{`Emoji`}</h2>
    <p>{`Around half of all emoji correspond to a single Unicode codepoint. For example, 🙂 is represented by the codepoint `}<inlineCode parentName="p">{`"\\U0001F642"`}</inlineCode>{`.`}</p>
    <pre><code parentName="pre" {...{
        "className": "language-python"
      }}>{`>>> print("\\U0001F642")
🙂
`}</code></pre>
    <p>{`The rest are represented by sequences of codepoints.`}</p>
    <h3>{`Combining emoji (👩 + 🎨 = 👩‍🎨)`}</h3>
    <h4>{`Zero-Width Joiners`}</h4>
    <p>{`What happens when you "combine" the emoji for "woman" (`}<inlineCode parentName="p">{`"👩" == "\\U0001F469"`}</inlineCode>{`) with the emoji for "artist palette" (`}<inlineCode parentName="p">{`"🎨" == "\\U0001F3A8"`}</inlineCode>{`)?`}</p>
    <p>{`You get a "woman artist" 👩‍🎨 , of course!`}</p>
    <p>{`👩 + 🎨 = 👩‍🎨!`}</p>
    <p><em parentName="p">{`But how do we combine emoji codepoint sequences in this way?`}</em></p>
    <p>{`Here's a hint. When we do `}<inlineCode parentName="p">{`len("👩‍🎨")`}</inlineCode>{` the result is `}<inlineCode parentName="p">{`3`}</inlineCode>{`. We already know that "woman" and "artist palette" are represented by one codepoint each, so there must be another codepoint in there.`}</p>
    <p>{`A `}<em parentName="p">{`zero width joiner (or zwidge/ZWJ)`}</em>{` is a Unicode codepoint (`}<inlineCode parentName="p">{`0x200d`}</inlineCode>{`) used to combine the definitions of codepoints that appear at each side of it. It has no visual representation and takes up no space (although implementations `}<em parentName="p">{`may`}</em>{` have it take a small amount of space):`}</p>
    <pre><code parentName="pre" {...{
        "className": "language-python"
      }}>{`>>> print("x\\u200dx")
x‍x
`}</code></pre>
    <p>{`The zwidge is used to combine two emoji codepoints into a single glyph. We take the codepoints at each side, and combine them into one. So, to combine the "woman" and "artist palette" emoji we
just need to place a zwidge `}<code><span style={{
          "backgroundColor": "lightgray"
        }}>{`\\u200D`}</span></code>{` between them:`}</p>
    <pre><code parentName="pre" {...{
        "className": "language-python"
      }}>{`>>> woman = "👩"
>>> zwidge = "\\u200d"
>>> artist_palette = "🎨"
>>> print(woman + zwidge + artist_palette)
👩‍🎨
`}</code></pre>
    <p>{`This idea can be extended to even more complex emoji. The "family" emoji make for good examples:`}</p>
    <ul>
      <li parentName="ul"><strong parentName="li">{`Family: Woman, Girl`}</strong>{`: 👩 + ZWJ + 👧 = 👩‍👧`}</li>
      <li parentName="ul"><strong parentName="li">{`Family: Man, Girl, Boy`}</strong>{`: 👨 + ZWJ + 👧 + ZWJ + 👦 = 👨‍👧‍👦`}</li>
    </ul>
    <p>{`By placing a ZWJ between each of the constituent members of a family, we produce a single emoji representing the combined family. `}</p>
    <p>{`To write out the distinct codepoints that form the "`}<strong parentName="p">{`Family: Man, Girl, Boy`}</strong>{`"
emoji, we would do `}<code>{`"\\U0001F468`}<span style={{
          "backgroundColor": "lightgray"
        }}>{`\\u200D`}</span>{`\\U0001F467`}<span style={{
          "backgroundColor": "lightgray"
        }}>{`\\u200D`}</span>{`\\U0001F466"`}</code>{`.`}</p>
    <p>{`This, of course, can be printed as you would expect:`}</p>
    <pre><code parentName="pre" {...{
        "className": "language-python"
      }}>{`>>> print("\\U0001F468\\u200D\\U0001F467\\u200D\\U0001F466")
👨‍👧‍👦
`}</code></pre>
    <p>{`There are over 1000 different combinations of emojis you can construct using zero-width joiners as of Unicode 14.0. You can see them all `}<a parentName="p" {...{
        "href": "https://unicode.org/Public/emoji/14.0/emoji-zwj-sequences.txt"
      }}>{`here`}</a>{`.`}</p>
    <p>{`Note that zero-width joiners are only required where the second codepoint is not a combining character or modifier.
Recall that in our `}<inlineCode parentName="p">{`\\u0075\\u0308`}</inlineCode>{` (ü) example above we didn't need a ZWJ.
That's because the accent (`}<inlineCode parentName="p">{`\\u0308`}</inlineCode>{`) is a combining character by default. It's not intended to exist in isolation.`}</p>
    <h4>{`Modifying skin tone`}</h4>
    <p>{`Another example of this is emoji modifier codepoints which modify skin colour:`}</p>
    <ul>
      <li parentName="ul">{`Person: 🧑 `}<inlineCode parentName="li">{`U+1F9D1`}</inlineCode></li>
      <li parentName="ul">{`Medium-dark skin tone: `}<inlineCode parentName="li">{`U+1F3FE`}</inlineCode></li>
    </ul>
    <p>{`Put them together (no ZWJ needed!) and you get... 🧑🏾 a person with medium-dark skin tone!`}</p>
    <p>{`Skin tone modifiers work as part of more complex emojis too. As long as skin is being shown in the emoji, the modifier will (generally) work.`}</p>
    <p>{`For example, take the woman technologist emoji 👩‍💻, consisting of the codepoints `}<inlineCode parentName="p">{`U+1F469`}</inlineCode>{`, `}<inlineCode parentName="p">{`U+200D`}</inlineCode>{`, `}<inlineCode parentName="p">{`U+1F4BB`}</inlineCode>{`.`}</p>
    <p>{`The first codepoint represents a woman emoji. If we place a skin tone modifier after it, we can adjust the skin tone of the woman behind the computer.`}</p>
    <pre><code parentName="pre" {...{
        "className": "language-python"
      }}>{`skin_modifiers = ["", "\\U0001F3FB", "\\U0001F3FD", "\\U0001F3FF"]
for mod in skin_modifiers:
    emoji = f"\\U0001F469{mod}\\u200d\\U0001F4BB"
    print(emoji)

👩‍💻
👩🏻‍💻
👩🏽‍💻
👩🏿‍💻
`}</code></pre>
    <h4>{`Country flags`}</h4>
    <p>{`Country flags are, in general, handled by two special codepoints.`}</p>
    <p>{`Each codepoint represents a letter in that country's code. These codepoints are called "Regional Indicator Symbols" and are different from their
ASCII counterparts. Put these symbols together and you have a "Regional Indicator Pair"!`}</p>
    <p>{`Let's take the flag of Japan 🇯🇵 for example. The country code for Japan is "JP".`}</p>
    <p>{`To create the flag for Japan from codepoints, we'd use the Regional Indicator symbol for "J" (`}<inlineCode parentName="p">{`U+1F1EF`}</inlineCode>{`), and the regional indicator symbol
for "P" (`}<inlineCode parentName="p">{`U+1F1F5`}</inlineCode>{`).`}</p>
    <pre><code parentName="pre" {...{
        "className": "language-python"
      }}>{`>>> j = "\\U0001F1EF"
>>> p = "\\U0001F1F5"
>>> print(j)
🇯
>>> print(p)
🇵
>>> print(j + p)
🇯🇵
`}</code></pre>
    <p>{`Although the vast majority of flags work like this, there are some exceptions (particularly around flags that were added to Unicode in recent years).`}</p>
    <p>{`For example, the flag of Scotland 🏴󠁧󠁢󠁳󠁣󠁴󠁿 consists of a sequence of 7 (SEVEN) codepoints called an "Emoji Tag Sequence."`}</p>
    <h2>{`References & useful links`}</h2>
    <ul>
      <li parentName="ul"><a parentName="li" {...{
          "href": "https://emojipedia.org"
        }}>{`Emojipedia`}</a></li>
      <li parentName="ul"><a parentName="li" {...{
          "href": "https://unicode.org"
        }}>{`Unicode.org`}</a></li>
    </ul>
    <hr />
    <p><strong parentName="p">{`Related:`}</strong>{` See `}<a parentName="p" {...{
        "href": "https://engineering.atspotify.com/2013/06/18/creative-usernames/"
      }}>{`this blog post external link`}</a>{` from Spotify in 2013 which describes an account hijacking
attack which exploited a flaw in Unicode normalisation (tl;dr: they support Unicode usernames, and two usernames could be normalised to the same string).`}</p>

    </MDXLayout>;
}
// Mark the exported component as MDX-generated.
// NOTE(review): presumably this flag is inspected by the MDX runtime — confirm against consumers.
MDXContent.isMDXComponent = true;
      