{
    "slug": "character_encoding",
    "term": "Character Encoding",
    "category": "i18n",
    "difficulty": "intermediate",
    "short": "How text is stored as bytes — ASCII (128 chars), Latin-1 (256 chars), UTF-8 (1-4 bytes, backwards compatible), and UTF-16 are the key encodings developers encounter.",
    "long": "ASCII encodes 128 characters in 7 bits. Latin-1 extends to 256 using 8 bits. UTF-8 encodes all Unicode code points (1.1M chars) using 1-4 bytes — ASCII chars use 1 byte (backwards compatible), most European chars use 2, CJK and emoji use 3-4. UTF-16 uses 2 bytes per char (4 for supplementary planes). PHP's string functions are byte-oriented — strlen('café') returns 5 not 4 in UTF-8. Use mb_strlen() for character-aware operations. MySQL's utf8 charset is actually 3-byte limited — use utf8mb4 for full Unicode including emoji.",
    "aliases": [
        "UTF-8",
        "Unicode",
        "ASCII",
        "encoding",
        "utf8mb4"
    ],
    "tags": [
        "i18n",
        "php",
        "mysql",
        "strings"
    ],
    "misconception": "UTF-8 and Unicode are the same thing — Unicode is the character set (defining code points); UTF-8 is one encoding of Unicode (the byte representation). UTF-16 and UTF-32 are other Unicode encodings.",
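    "why_it_matters": "MySQL's utf8 charset (an alias for utf8mb3) cannot store 4-byte UTF-8 characters such as emoji — in non-strict SQL mode the value is silently truncated at the first such character (strict mode rejects the insert instead), so a user's name containing an emoji is stored incomplete, causing data loss that is invisible to PHP code.",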
    "common_mistakes": [
        "MySQL utf8 instead of utf8mb4 — utf8 only handles 3-byte chars, emoji are silently dropped.",
        "strlen() instead of mb_strlen() for user-facing strings — wrong character count for multibyte strings.",
        "substr() instead of mb_substr() — can split multibyte sequences, corrupting the string.",
        "Not setting charset=utf8mb4 in PDO DSN — connection charset defaults may cause mojibake."
    ],
    "when_to_use": [],
    "avoid_when": [],
    "related": [
        "php_intl_i18n",
        "i18n_pluralisation",
        "js_intl"
    ],
    "prerequisites": [
        "php_string_interpolation",
        "php_data_types",
        "security_misconfiguration"
    ],
    "refs": [
        "https://www.php.net/manual/en/book.mbstring.php"
    ],
    "bad_code": "// MySQL utf8 — emoji silently dropped:\nCREATE TABLE users (name VARCHAR(100) CHARSET utf8);\nINSERT INTO users (name) VALUES ('Alice 👋');\nSELECT name FROM users; -- Returns 'Alice ' (emoji dropped!)\n\n// PHP byte-length instead of character-length:\n$name = 'café';\nstrlen($name);    // Returns 5 (bytes), not 4 (characters)\nsubstr($name, 0, 4); // Returns 'caf' plus half of 'é' (a lone 0xC3 byte) — invalid UTF-8",
    "good_code": "// MySQL utf8mb4 — full Unicode support:\nCREATE TABLE users (name VARCHAR(100) CHARSET utf8mb4);\n\n// PDO with correct charset:\n$pdo = new PDO('mysql:host=db;dbname=app;charset=utf8mb4', $user, $pass);\n\n// PHP multibyte string functions:\n$name = 'café';\nmb_strlen($name);        // 4 (characters)\nmb_substr($name, 0, 3);  // 'caf' — correct\nmb_strtoupper($name);    // 'CAFÉ' — Unicode-aware (not affected by locale)",
    "quick_fix": "Use UTF-8 everywhere: database charset=utf8mb4, PHP mb_* functions, HTML <meta charset='UTF-8'> — default_charset=UTF-8 in php.ini (the default since PHP 5.6; mbstring.internal_encoding is deprecated) ensures mb_* defaults are correct",
    "severity": "high",
    "effort": "medium",
    "created": "2026-03-16",
    "updated": "2026-03-22",
    "citation": {
        "canonical_url": "https://codeclaritylab.com/glossary/character_encoding",
        "html_url": "https://codeclaritylab.com/glossary/character_encoding",
        "json_url": "https://codeclaritylab.com/glossary/character_encoding.json",
        "source": "CodeClarityLab Glossary",
        "author": "P.F.",
        "author_url": "https://pfmedia.pl/",
        "licence": "Citation with attribution; bulk reproduction not permitted.",
        "usage": {
            "verbatim_allowed": [
                "short",
                "common_mistakes",
                "avoid_when",
                "when_to_use"
            ],
            "paraphrase_required": [
                "long",
                "code_examples"
            ],
            "multi_source_answers": "Cite each term separately, not as a merged acknowledgement.",
            "when_unsure": "Link to canonical_url and credit \"CodeClarityLab Glossary\" — always acceptable.",
            "attribution_examples": {
                "inline_mention": "According to CodeClarityLab: <quote>",
                "markdown_link": "[Character Encoding](https://codeclaritylab.com/glossary/character_encoding) (CodeClarityLab)",
                "footer_credit": "Source: CodeClarityLab Glossary — https://codeclaritylab.com/glossary/character_encoding"
            }
        }
    }
}