PHP Classes

File: src/voku/helper/ASCII.php

Recommend this page to a friend!
  Classes of Lars Moelleken   portable ascii   src/voku/helper/ASCII.php   Download  
File: src/voku/helper/ASCII.php
Role: Class source
Content type: text/plain
Description: Class source
Class: portable ascii
Manipulate text strings without special extensions
Author: By
Last change:
Date: 5 years ago
Size: 27,104 bytes
 

Contents

Class file image Download
<?php

declare(strict_types=1);

namespace voku\helper;

/**
 * Static helpers for cleaning strings and converting them to ASCII.
 *
 * The replacement tables are not part of this file: they are loaded lazily
 * from PHP files in the "data" directory next to this class (see getData()
 * and getDataIfExists()) and cached in static properties / static locals.
 */
final class ASCII
{
    /**
     * Replacement maps loaded from "data/ascii_by_languages.php", indexed by
     * language key; each entry maps a character to its replacement.
     * Null until prepareAsciiMaps() has run.
     *
     * @var array|null
     */
    private static $ASCII_MAPS;

    /**
     * $ASCII_MAPS merged with "data/ascii_extras_by_languages.php".
     * Null until prepareAsciiExtrasMaps() has run.
     *
     * @var array|null
     */
    private static $ASCII_MAPS_EXTRAS;

    /**
     * Lookup table loaded from "data/ascii_ord.php"; it is indexed with a
     * single byte and the result is compared against byte ranges
     * (presumably byte => ordinal value; verify against the data file).
     *
     * @var array|null
     */
    private static $ORD;

    /**
     * bidirectional text chars
     *
     * url: https://www.w3.org/International/questions/qa-bidi-unicode-controls
     *
     * @var array
     */
    private static $BIDI_UNI_CODE_CONTROLS_TABLE = [
        // LEFT-TO-RIGHT EMBEDDING (use -> dir = "ltr")
        8234 => "\xE2\x80\xAA",
        // RIGHT-TO-LEFT EMBEDDING (use -> dir = "rtl")
        8235 => "\xE2\x80\xAB",
        // POP DIRECTIONAL FORMATTING (use -> </bdo>)
        8236 => "\xE2\x80\xAC",
        // LEFT-TO-RIGHT OVERRIDE (use -> <bdo dir = "ltr">)
        8237 => "\xE2\x80\xAD",
        // RIGHT-TO-LEFT OVERRIDE (use -> <bdo dir = "rtl">)
        8238 => "\xE2\x80\xAE",
        // LEFT-TO-RIGHT ISOLATE (use -> dir = "ltr")
        8294 => "\xE2\x81\xA6",
        // RIGHT-TO-LEFT ISOLATE (use -> dir = "rtl")
        8295 => "\xE2\x81\xA7",
        // FIRST STRONG ISOLATE (use -> dir = "auto")
        8296 => "\xE2\x81\xA8",
        // POP DIRECTIONAL ISOLATE
        8297 => "\xE2\x81\xA9",
    ];

    /**
     * Returns the raw replacement maps, indexed by language key.
     *
     * @param bool $withExtras [optional] <p>Return the maps merged with the "extras" data instead.</p>
     *
     * @return array
     */
    public static function charsArray(bool $withExtras = false): array
    {
        if ($withExtras) {
            self::prepareAsciiExtrasMaps();

            return self::$ASCII_MAPS_EXTRAS;
        }

        self::prepareAsciiMaps();

        return self::$ASCII_MAPS;
    }

    /**
     * Returns a replacement array for ASCII methods with a mix of multiple languages.
     *
     * The result is grouped by replacement: each key is a replacement string
     * and each value is the list of original characters that map to it.
     *
     * @param bool $withExtras [optional] <p>Add some more replacements e.g. "£" with " pound ".</p>
     *
     * @return array
     *               <p>An array of replacements.</p>
     */
    public static function charsArrayWithMultiLanguageValues(bool $withExtras = false): array
    {
        static $CHARS_ARRAY;

        // '' for false, '1' for true
        $cacheKey = '' . $withExtras;

        if (isset($CHARS_ARRAY[$cacheKey])) {
            return $CHARS_ARRAY[$cacheKey];
        }

        $grouped = [];
        $flat = self::charsArrayWithSingleLanguageValues($withExtras);
        $origList = (array) $flat['orig'];

        // "orig" and "replace" are index-aligned lists (array_keys() and
        // array_values() of one map), so a direct index lookup pairs them up;
        // no nested scan over both lists is needed.
        foreach ((array) $flat['replace'] as $index => $replaceValue) {
            if (\array_key_exists($index, $origList)) {
                $grouped[$replaceValue][] = $origList[$index];
            }
        }

        $CHARS_ARRAY[$cacheKey] = $grouped;

        return $grouped;
    }

    /**
     * Returns a replacement array for ASCII methods with one language.
     *
     * For example, German will map 'ä' to 'ae', while other languages
     * will simply return e.g. 'a'.
     *
     * @param string $language   [optional] <p>Language of the source string e.g.: en, de_at, or de-ch</p>
     * @param bool   $withExtras [optional] <p>Add some more replacements e.g. "£" with " pound ".</p>
     *
     * @return array{orig: string[], replace: string[]}
     *                     <p>An array of replacements; both lists are empty if the language is unknown.</p>
     */
    public static function charsArrayWithOneLanguage(
        string $language = 'en',
        bool $withExtras = false
    ): array {
        // Collapse a doubled code such as "de_DE" / "de-de" to "de", then
        // normalize to lower case with "_" as the separator.
        $regex = '/(?<first>[a-z]+)[\-_]\g{first}/i';
        $language = \str_replace(
            '-',
            '_',
            \strtolower(
                (string) \preg_replace($regex, '$1', $language)
            )
        );

        static $CHARS_ARRAY = [];

        // '' for false, '1' for true
        $cacheKey = '' . $withExtras;

        // check static cache
        if (isset($CHARS_ARRAY[$cacheKey][$language])) {
            return $CHARS_ARRAY[$cacheKey][$language];
        }

        if ($withExtras) {
            self::prepareAsciiExtrasMaps();
        } else {
            self::prepareAsciiMaps();
        }

        if (!isset(self::$ASCII_MAPS[$language])) {
            // Always return lists, never '': callers such as to_slugify()
            // call count() on "orig", which fails on a string.
            $CHARS_ARRAY[$cacheKey][$language] = [
                'orig'    => [],
                'replace' => [],
            ];

            return $CHARS_ARRAY[$cacheKey][$language];
        }

        if ($withExtras) {
            // On duplicate keys the base map wins ("+" keeps the left value).
            $map = \array_merge(self::$ASCII_MAPS[$language] + self::$ASCII_MAPS_EXTRAS[$language]);
        } else {
            $map = self::$ASCII_MAPS[$language];
        }

        $CHARS_ARRAY[$cacheKey][$language] = [
            'orig'    => \array_keys($map),
            'replace' => \array_values($map),
        ];

        return $CHARS_ARRAY[$cacheKey][$language];
    }

    /**
     * Returns a replacement array for ASCII methods with multiple languages.
     *
     * All language maps are merged into one flat map, which is then split
     * into two index-aligned lists.
     *
     * @param bool $withExtras [optional] <p>Add some more replacements e.g. "£" with " pound ".</p>
     *
     * @return array{orig: string[], replace: string[]}
     *                     <p>An array of replacements.</p>
     */
    public static function charsArrayWithSingleLanguageValues(bool $withExtras = false): array
    {
        static $CHARS_ARRAY = [];

        // '' for false, '1' for true
        $cacheKey = '' . $withExtras;

        if (isset($CHARS_ARRAY[$cacheKey])) {
            return $CHARS_ARRAY[$cacheKey];
        }

        // Collect the per-language maps into a plain list so that they can be
        // spread into array_merge() (language keys are strings).
        $maps = [];

        if ($withExtras) {
            self::prepareAsciiExtrasMaps();

            foreach (self::$ASCII_MAPS as $map) {
                $maps[] = $map;
            }

            foreach (self::$ASCII_MAPS_EXTRAS as $map) {
                $maps[] = $map;
            }
        } else {
            self::prepareAsciiMaps();

            foreach (self::$ASCII_MAPS as $map) {
                $maps[] = $map;
            }
        }

        $merged = \array_merge([], ...$maps);

        $CHARS_ARRAY[$cacheKey] = [
            'orig'    => \array_keys($merged),
            'replace' => \array_values($merged),
        ];

        return $CHARS_ARRAY[$cacheKey];
    }

    /**
     * Accepts a string and removes all non-UTF-8 characters from it + extras if needed.
     *
     * @param string $str                         <p>The string to be sanitized.</p>
     * @param bool   $normalize_whitespace        [optional] <p>Set to true, if you need to normalize the
     *                                            whitespace.</p>
     * @param bool   $keep_non_breaking_space     [optional] <p>Set to true, to keep non-breaking-spaces, in
     *                                            combination with $normalize_whitespace</p>
     * @param bool   $normalize_msword            [optional] <p>Set to true, if you need to normalize MS Word
     *                                            chars e.g.: "…" => "..."</p>
     * @param bool   $remove_invisible_characters [optional] <p>Set to false, if you do not want to remove
     *                                            invisible characters e.g.: "\0"</p>
     *
     * @return string clean UTF-8 encoded string
     */
    public static function clean(
        string $str,
        bool $normalize_whitespace = true,
        bool $keep_non_breaking_space = false,
        bool $normalize_msword = true,
        bool $remove_invisible_characters = true
    ): string {
        // http://stackoverflow.com/questions/1401317/remove-non-utf8-characters-from-string
        // caused connection reset problem on larger strings

        // Extended ("x") pattern: group 1 captures runs of well-formed byte
        // sequences, groups 2 and 3 match stray bytes. Replacing every match
        // with '$1' keeps the well-formed runs and drops the stray bytes.
        $regex = '/
          (
            (?: [\x00-\x7F]               # single-byte sequences   0xxxxxxx
            |   [\xC0-\xDF][\x80-\xBF]    # double-byte sequences   110xxxxx 10xxxxxx
            |   [\xE0-\xEF][\x80-\xBF]{2} # triple-byte sequences   1110xxxx 10xxxxxx * 2
            |   [\xF0-\xF7][\x80-\xBF]{3} # quadruple-byte sequence 11110xxx 10xxxxxx * 3
            ){1,100}                      # ...one or more times
          )
        | ( [\x80-\xBF] )                 # invalid byte in range 10000000 - 10111111
        | ( [\xC0-\xFF] )                 # invalid byte in range 11000000 - 11111111
        /x';
        $str = (string) \preg_replace($regex, '$1', $str);

        if ($normalize_whitespace === true) {
            $str = self::normalize_whitespace($str, $keep_non_breaking_space);
        }

        if ($normalize_msword === true) {
            $str = self::normalize_msword($str);
        }

        if ($remove_invisible_characters === true) {
            $str = self::remove_invisible_characters($str);
        }

        return $str;
    }

    /**
     * Checks if a string is 7 bit ASCII.
     *
     * Accepted bytes are 0x09, 0x0A, 0x0D, 0x10, 0x13 and 0x20-0x7E.
     *
     * @param string $str <p>The string to check.</p>
     *
     * @return bool
     *              <strong>true</strong> if it is ASCII<br>
     *              <strong>false</strong> otherwise
     */
    public static function is_ascii(string $str): bool
    {
        if ($str === '') {
            return true;
        }

        // NOTE(review): \x10 and \x13 look like decimal 10 / 13 (LF / CR)
        // written as hex; LF and CR are already covered by \x0A and \x0D.
        // Left unchanged so that existing results do not change - confirm.
        return !\preg_match('/[^\x09\x10\x13\x0A\x0D\x20-\x7E]/', $str);
    }

    /**
     * Returns a string with smart quotes, ellipsis characters, and dashes from
     * Windows-1252 (commonly used in Word documents) replaced by their ASCII
     * equivalents.
     *
     * @param string $str <p>The string to be normalized.</p>
     *
     * @return string
     */
    public static function normalize_msword(string $str): string
    {
        if ($str === '') {
            return '';
        }

        // UTF-8 byte sequence => ASCII replacement
        $map = [
            "\xc2\xab"     => '"',   // « (U+00AB)
            "\xc2\xbb"     => '"',   // » (U+00BB)
            "\xe2\x80\x98" => "'",   // ‘ (U+2018)
            "\xe2\x80\x99" => "'",   // ’ (U+2019)
            "\xe2\x80\x9a" => "'",   // ‚ (U+201A)
            "\xe2\x80\x9b" => "'",   // ‛ (U+201B)
            "\xe2\x80\x9c" => '"',   // “ (U+201C)
            "\xe2\x80\x9d" => '"',   // ” (U+201D)
            "\xe2\x80\x9e" => '"',   // „ (U+201E)
            "\xe2\x80\x9f" => '"',   // ‟ (U+201F)
            "\xe2\x80\xb9" => "'",   // ‹ (U+2039)
            "\xe2\x80\xba" => "'",   // › (U+203A)
            "\xe2\x80\x93" => '-',   // – (U+2013)
            "\xe2\x80\x94" => '-',   // — (U+2014)
            "\xe2\x80\xa6" => '...', // … (U+2026)
        ];

        return \str_replace(\array_keys($map), \array_values($map), $str);
    }

    /**
     * Normalize the whitespace.
     *
     * Every key of the ' ' entry of the ASCII maps is replaced by a plain
     * space; by default the bidirectional control characters are removed.
     *
     * @param string $str                     <p>The string to be normalized.</p>
     * @param bool   $keepNonBreakingSpace    [optional] <p>Set to true, to keep non-breaking-spaces.</p>
     * @param bool   $keepBidiUnicodeControls [optional] <p>Set to true, to keep non-printable (for the web)
     *                                        bidirectional text chars.</p>
     *
     * @return string
     */
    public static function normalize_whitespace(
        string $str,
        bool $keepNonBreakingSpace = false,
        bool $keepBidiUnicodeControls = false
    ): string {
        if ($str === '') {
            return '';
        }

        static $WHITESPACE_CACHE = [];
        $cacheKey = (int) $keepNonBreakingSpace;

        if (!isset($WHITESPACE_CACHE[$cacheKey])) {
            self::prepareAsciiMaps();

            $WHITESPACE_CACHE[$cacheKey] = self::$ASCII_MAPS[' '];

            if ($keepNonBreakingSpace === true) {
                // "\xc2\xa0" is the UTF-8 encoding of the non-breaking space
                unset($WHITESPACE_CACHE[$cacheKey]["\xc2\xa0"]);
            }

            $WHITESPACE_CACHE[$cacheKey] = \array_keys($WHITESPACE_CACHE[$cacheKey]);
        }

        if ($keepBidiUnicodeControls === false) {
            static $BIDI_UNICODE_CONTROLS_CACHE = null;

            if ($BIDI_UNICODE_CONTROLS_CACHE === null) {
                $BIDI_UNICODE_CONTROLS_CACHE = \array_values(self::$BIDI_UNI_CODE_CONTROLS_TABLE);
            }

            $str = \str_replace($BIDI_UNICODE_CONTROLS_CACHE, '', $str);
        }

        return \str_replace($WHITESPACE_CACHE[$cacheKey], ' ', $str);
    }

    /**
     * Remove invisible characters from a string.
     *
     * e.g.: This prevents sandwiching null characters between ascii characters, like Java\0script.
     *
     * Copied from https://github.com/bcit-ci/CodeIgniter/blob/develop/system/core/Common.php
     *
     * @param string $str
     * @param bool   $url_encoded [optional] <p>Also remove the url encoded forms (e.g. "%00").</p>
     * @param string $replacement [optional] <p>The string every match is replaced with.</p>
     *
     * @return string
     */
    public static function remove_invisible_characters(
        string $str,
        bool $url_encoded = true,
        string $replacement = ''
    ): string {
        $non_displayables = [];

        // every control character except newline (dec 10),
        // carriage return (dec 13) and horizontal tab (dec 09)
        if ($url_encoded) {
            $non_displayables[] = '/%0[0-8bcefBCEF]/'; // url encoded 00-08, 11, 12, 14, 15
            $non_displayables[] = '/%1[0-9a-fA-F]/'; // url encoded 16-31
        }

        $non_displayables[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // 00-08, 11, 12, 14-31, 127

        // Repeat until nothing matches any more: removing one match can join
        // the surrounding text into a new match.
        do {
            $str = (string) \preg_replace($non_displayables, $replacement, $str, -1, $count);
        } while ($count !== 0);

        return $str;
    }

    /**
     * Returns an ASCII version of the string. A set of non-ASCII characters are
     * replaced with their closest ASCII counterparts, and the rest are removed
     * by default. The language or locale of the source string can be supplied
     * for language-specific transliteration in any of the following formats:
     * en, en_GB, or en-GB. For example, passing "de" results in "äöü" mapping
     * to "aeoeue" rather than "aou" as in other languages.
     *
     * @param string $str               <p>The input string.</p>
     * @param string $language          [optional] <p>Language of the source string.</p>
     * @param bool   $removeUnsupported [optional] <p>Whether or not to remove the
     *                                  unsupported characters.</p>
     *
     * @return string
     *                <p>A string that contains only ASCII characters.</p>
     */
    public static function to_ascii(
        string $str,
        string $language = 'en',
        bool $removeUnsupported = true
    ): string {
        if ($str === '') {
            return '';
        }

        // language specific replacements first ...
        $langSpecific = self::charsArrayWithOneLanguage($language);
        $str = \str_replace($langSpecific['orig'], $langSpecific['replace'], $str);

        // ... then the replacements of all languages
        foreach (self::charsArrayWithMultiLanguageValues() as $replace => $orig) {
            // Numeric replacements such as "0" come back from the array key as
            // int; str_replace() rejects an int replacement under strict_types.
            $str = \str_replace($orig, (string) $replace, $str);
        }

        if ($removeUnsupported) {
            $str = (string) \str_replace(["\n\r", "\n", "\r", "\t"], ' ', $str);
            // drop everything outside of the printable ASCII range
            $str = (string) \preg_replace('/[^\\x20-\\x7E]/u', '', $str);
        }

        return $str;
    }

    /**
     * Convert given string to safe filename (and keep string case).
     *
     * @param string $str
     * @param bool   $use_transliterate No transliteration, conversion etc. is done by default - unsafe characters are
     *                                  simply replaced with hyphen.
     * @param string $fallback_char
     *
     * @return string
     */
    public static function to_filename(
        string $str,
        bool $use_transliterate = false,
        string $fallback_char = '-'
    ): string {
        if ($use_transliterate === true) {
            $str = self::to_transliterate($str, $fallback_char);
        }

        $fallback_char_escaped = \preg_quote($fallback_char, '/');

        $str = (string) \preg_replace(
            [
                '/[^' . $fallback_char_escaped . '\\.\\-a-zA-Z0-9\\s]/', // 1) remove un-needed chars
                '/[\\s]+/u', // 2) convert spaces to $fallback_char
                '/[' . $fallback_char_escaped . ']+/u', // 3) remove double $fallback_char's
            ],
            [
                '',
                $fallback_char,
                $fallback_char,
            ],
            $str
        );

        return \trim($str, $fallback_char);
    }

    /**
     * Converts the string into an URL slug. This includes replacing non-ASCII
     * characters with their closest ASCII equivalents, removing remaining
     * non-ASCII and non-alphanumeric characters, and replacing whitespace with
     * $separator. The separator defaults to a single dash, and the string
     * is also converted to lowercase. The language of the source string can
     * also be supplied for language-specific transliteration.
     *
     * @param string   $str
     * @param string   $separator    [optional] <p>The string used to replace whitespace.</p>
     * @param string   $language     [optional] <p>Language of the source string.</p>
     * @param string[] $replacements [optional] <p>A map of replaceable strings.</p>
     *
     * @return string
     *                <p>A string that has been converted to an URL slug.</p>
     */
    public static function to_slugify(
        string $str,
        string $separator = '-',
        string $language = 'en',
        array $replacements = []
    ): string {
        if ($str === '') {
            return '';
        }

        foreach ($replacements as $from => $to) {
            // PHP turns numeric string keys into int; str_replace() rejects an
            // int search value under strict_types.
            $str = \str_replace((string) $from, $to, $str);
        }

        $langSpecific = self::charsArrayWithOneLanguage($language, true);
        if (\count($langSpecific['orig']) > 0) {
            $str = \str_replace($langSpecific['orig'], $langSpecific['replace'], $str);
        }

        $charsArray = self::charsArrayWithSingleLanguageValues(true);
        $str = \str_replace($charsArray['orig'], $charsArray['replace'], $str);

        $str = \str_replace('@', $separator, $str);

        // keep only letters, digits, whitespace, "-", "_" and the separator
        $str = (string) \preg_replace(
            '/[^a-zA-Z\\d\\s\\-_' . \preg_quote($separator, '/') . ']/u',
            '',
            $str
        );

        // lower case + strip quotes / whitespace from both ends
        $str = (string) \preg_replace('/^[\'\\s]+|[\'\\s]+$/', '', \strtolower($str));

        // collapse runs of "-", "_" and whitespace into one separator
        $str = (string) \preg_replace('/[\\-_\\s]+/', $separator, $str);

        // Remove one leading and one trailing separator. With an empty
        // separator there is nothing to remove (and strpos() would warn about
        // an empty needle on PHP 7).
        $l = \strlen($separator);
        if ($l > 0) {
            if (\strpos($str, $separator) === 0) {
                $str = (string) \substr($str, $l);
            }

            if (\substr($str, -$l) === $separator) {
                $str = (string) \substr($str, 0, \strlen($str) - $l);
            }
        }

        return $str;
    }

    /**
     * Returns an ASCII version of the string. A set of non-ASCII characters are
     * replaced with their closest ASCII counterparts, and the rest are removed
     * unless instructed otherwise.
     *
     * @param string $str     <p>The input string.</p>
     * @param string $unknown [optional] <p>Character use if character unknown. (default is ?)</p>
     * @param bool   $strict  [optional] <p>Use "transliterator_transliterate()" from PHP-Intl | WARNING: bad
     *                        performance</p>
     *
     * @return string
     *                <p>A String that contains only ASCII characters.</p>
     */
    public static function to_transliterate(
        string $str,
        string $unknown = '?',
        bool $strict = false
    ): string {
        static $UTF8_TO_ASCII;
        static $SUPPORT = [];

        if ($str === '') {
            return '';
        }

        if (!isset($SUPPORT['intl'])) {
            $SUPPORT['intl'] = \extension_loaded('intl');
        }

        // check if we only have ASCII, first (better performance)
        if (self::is_ascii($str) === true) {
            return $str;
        }

        $str = self::clean($str);

        // check again, if we only have ASCII, now ...
        if (self::is_ascii($str) === true) {
            return $str;
        }

        if (
            $strict === true
            &&
            $SUPPORT['intl'] === true
        ) {
            // INFO: https://unicode.org/cldr/utility/character.jsp?a=%E2%84%8C
            $str = (string) \transliterator_transliterate(
                'NFKC; [:Nonspacing Mark:] Remove; NFKC; Any-Latin; Latin-ASCII;',
                $str
            );

            // check again, if we only have ASCII, now ...
            if (self::is_ascii($str) === true) {
                return $str;
            }
        }

        if (self::$ORD === null) {
            self::$ORD = self::getData('ascii_ord');
        }

        // split the string into its characters (one array entry per match)
        \preg_match_all('/.|[^\x00]$/us', $str, $ar);
        $chars = $ar[0];

        foreach ($chars as &$c) {
            // Reset per character, so that a value decoded for a previous
            // character can never be reused for the current one.
            $ord = null;

            $ordC0 = self::$ORD[$c[0]];

            // ASCII - next please
            if ($ordC0 >= 0 && $ordC0 <= 127) {
                continue;
            }

            $ordC1 = self::$ORD[$c[1]];

            // Decode the code point from the lead byte and its continuation
            // bytes; the lead byte range decides how many bytes are read.
            if ($ordC0 >= 192 && $ordC0 <= 223) {
                $ord = ($ordC0 - 192) * 64 + ($ordC1 - 128);
            }

            if ($ordC0 >= 224) {
                $ordC2 = self::$ORD[$c[2]];

                if ($ordC0 <= 239) {
                    $ord = ($ordC0 - 224) * 4096 + ($ordC1 - 128) * 64 + ($ordC2 - 128);
                }

                if ($ordC0 >= 240) {
                    $ordC3 = self::$ORD[$c[3]];

                    if ($ordC0 <= 247) {
                        $ord = ($ordC0 - 240) * 262144
                               + ($ordC1 - 128) * 4096
                               + ($ordC2 - 128) * 64
                               + ($ordC3 - 128);
                    }

                    if ($ordC0 >= 248) {
                        $ordC4 = self::$ORD[$c[4]];

                        if ($ordC0 <= 251) {
                            $ord = ($ordC0 - 248) * 16777216
                                   + ($ordC1 - 128) * 262144
                                   + ($ordC2 - 128) * 4096
                                   + ($ordC3 - 128) * 64
                                   + ($ordC4 - 128);
                        }

                        if ($ordC0 >= 252) {
                            $ordC5 = self::$ORD[$c[5]];

                            if ($ordC0 <= 253) {
                                $ord = ($ordC0 - 252) * 1073741824
                                       + ($ordC1 - 128) * 16777216
                                       + ($ordC2 - 128) * 262144
                                       + ($ordC3 - 128) * 4096
                                       + ($ordC4 - 128) * 64
                                       + ($ordC5 - 128);
                            }
                        }
                    }
                }
            }

            if ($ordC0 === 254 || $ordC0 === 255) {
                $c = $unknown;

                continue;
            }

            if ($ord === null) {
                $c = $unknown;

                continue;
            }

            // The tables are split into "banks" of 256 code points, one data
            // file per bank ("x00", "x01", ...), loaded on first use.
            $bank = $ord >> 8;
            if (!isset($UTF8_TO_ASCII[$bank])) {
                $UTF8_TO_ASCII[$bank] = self::getDataIfExists(\sprintf('x%02x', $bank));
                if ($UTF8_TO_ASCII[$bank] === false) {
                    // remember the miss, so the file is not looked up again
                    $UTF8_TO_ASCII[$bank] = [];
                }
            }

            // position of the code point inside of its bank
            $newchar = $ord & 255;

            if (isset($UTF8_TO_ASCII[$bank][$newchar])) {
                $c = $UTF8_TO_ASCII[$bank][$newchar];
            } else {
                $c = $unknown;
            }
        }
        // break the reference that is left over from the loop
        unset($c);

        return \implode('', $chars);
    }

    /**
     * get data from "/data/*.php"
     *
     * The file is included without an existence check.
     *
     * @param string $file <p>The file name without the ".php" extension.</p>
     *
     * @return array
     */
    private static function getData(string $file): array
    {
        /** @psalm-suppress UnresolvableInclude */
        return include __DIR__ . '/data/' . $file . '.php';
    }

    /**
     * get data from "/data/*.php"
     *
     * @param string $file <p>The file name without the ".php" extension.</p>
     *
     * @return false|mixed will return false on error
     */
    private static function getDataIfExists(string $file)
    {
        $file = __DIR__ . '/data/' . $file . '.php';
        if (\file_exists($file)) {
            return include $file;
        }

        return false;
    }

    /**
     * Loads the "extras" maps on first use: the base maps merged with the
     * data from "ascii_extras_by_languages".
     *
     * @return void
     */
    private static function prepareAsciiExtrasMaps()
    {
        if (self::$ASCII_MAPS_EXTRAS === null) {
            self::prepareAsciiMaps();

            self::$ASCII_MAPS_EXTRAS = \array_merge(
                self::$ASCII_MAPS,
                self::getData('ascii_extras_by_languages')
            );
        }
    }

    /**
     * Loads the base maps from "ascii_by_languages" on first use.
     *
     * @return void
     */
    private static function prepareAsciiMaps()
    {
        if (self::$ASCII_MAPS === null) {
            self::$ASCII_MAPS = self::getData('ascii_by_languages');
        }
    }
}