Code Coverage for /Users/ericduran/Sites/drupal/drupal/core/lib/Drupal/Component/Transliteration/PhpTransliteration.php

	Code Coverage
	Classes and Traits			Functions and Methods				Lines
Total	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	8 / 8	CRAP	100.00% covered (success)	100.00%	65 / 65
PhpTransliteration	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	8 / 8	36	100.00% covered (success)	100.00%	65 / 65
__construct				100.00% covered (success)	100.00%	1 / 1	2	100.00% covered (success)	100.00%	2 / 2
removeDiacritics				100.00% covered (success)	100.00%	1 / 1	9	100.00% covered (success)	100.00%	14 / 14
transliterate				100.00% covered (success)	100.00%	1 / 1	5	100.00% covered (success)	100.00%	13 / 13
ordUTF8				100.00% covered (success)	100.00%	1 / 1	5	100.00% covered (success)	100.00%	10 / 10
replace				100.00% covered (success)	100.00%	1 / 1	4	100.00% covered (success)	100.00%	7 / 7
lookupReplacement				100.00% covered (success)	100.00%	1 / 1	3	100.00% covered (success)	100.00%	5 / 5
readLanguageOverrides				100.00% covered (success)	100.00%	1 / 1	4	100.00% covered (success)	100.00%	7 / 7
readGenericData				100.00% covered (success)	100.00%	1 / 1	4	100.00% covered (success)	100.00%	7 / 7

1	<?php
2
3	/**
4	* @file
5	* Contains \Drupal\Component\Transliteration\PhpTransliteration.
6	*
7	* Some parts of this code were derived from the MediaWiki project's UtfNormal
8	* class, Copyright © 2004 Brion Vibber <brion@pobox.com>,
9	* http://www.mediawiki.org/
10	*/
11
12	namespace Drupal\Component\Transliteration;
13
/**
 * Implements transliteration without using the PECL extensions.
 *
 * Transliterations are done character-by-character, by looking up non-US-ASCII
 * characters in a transliteration database.
 *
 * The database comes from two types of files, both of which are searched for in
 * the PhpTransliteration::$dataDirectory directory. First, language-specific
 * overrides are searched (see PhpTransliteration::readLanguageOverrides()). If
 * there is no language-specific override for a character, the generic
 * transliteration character tables are searched (see
 * PhpTransliteration::readGenericData()). If the generic table has no usable
 * entry for the character, or an illegal character is encountered, then a
 * substitute character is returned.
 */
class PhpTransliteration implements TransliterationInterface {

  /**
   * Directory where data for transliteration resides.
   *
   * The constructor sets this (by default) to subdirectory 'data' underneath
   * the directory where the class's PHP file resides.
   *
   * @var string
   */
  protected $dataDirectory;

  /**
   * Associative array of language-specific character transliteration tables.
   *
   * The outermost array keys are language codes. For each language code key,
   * the value is an array whose keys are Unicode character codes, and whose
   * values are the transliterations of those characters to US-ASCII. This is
   * set up as needed in PhpTransliteration::replace() by calling
   * PhpTransliteration::readLanguageOverrides().
   *
   * @var array
   */
  protected $languageOverrides = array();

  /**
   * Non-language-specific transliteration tables.
   *
   * Array whose keys are "bank" numbers (the Unicode character code shifted
   * right by 8 bits), and whose values are arrays of transliterations keyed
   * by the low 8 bits of the character code. This is set up as needed in
   * PhpTransliteration::lookupReplacement() by calling
   * PhpTransliteration::readGenericData().
   *
   * @var array
   */
  protected $genericMap = array();

  /**
   * Constructs a transliteration object.
   *
   * @param string $data_directory
   *   (optional) The directory where data files reside. If omitted, defaults
   *   to subdirectory 'data' underneath the directory where the class's PHP
   *   file resides.
   */
  public function __construct($data_directory = NULL) {
    $this->dataDirectory = (isset($data_directory)) ? $data_directory : __DIR__ . '/data';
  }

  /**
   * {@inheritdoc}
   */
  public function removeDiacritics($string) {
    $result = '';

    // preg_split() returns FALSE when $string is not valid UTF-8. The result
    // in that case is an empty string; bail out explicitly instead of
    // letting foreach raise a warning on a non-array.
    $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY);
    if ($characters === FALSE) {
      return $result;
    }

    // Two Unicode ranges are treated as accented letters, apart from the
    // code points listed here. The lists do not depend on the character
    // being examined, so build them once outside the loop.
    $exclusions_range1 = array(0x00d0, 0x00d7, 0x00f0, 0x00f7, 0x0138, 0x014a, 0x014b);
    $exclusions_range2 = array(0x01DD, 0x01f7, 0x021c, 0x021d, 0x0220, 0x0221, 0x0241, 0x0242, 0x0245);

    foreach ($characters as $character) {
      $code = self::ordUTF8($character);

      $range1 = $code > 0x00bf && $code < 0x017f;
      $range2 = $code > 0x01cc && $code < 0x0250;

      $replacement = $character;
      if (($range1 && !in_array($code, $exclusions_range1, TRUE)) || ($range2 && !in_array($code, $exclusions_range2, TRUE))) {
        // 'xyz' is a three-character sentinel: a character with no entry in
        // the generic table comes back as 'xyz' and therefore fails the
        // single-character test below, leaving the original character as is.
        $to_add = $this->lookupReplacement($code, 'xyz');
        if (strlen($to_add) === 1) {
          $replacement = $to_add;
        }
      }

      $result .= $replacement;
    }

    return $result;
  }

  /**
   * {@inheritdoc}
   */
  public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
    $result = '';
    $length = 0;

    // preg_split() returns FALSE when $string is not valid UTF-8. The result
    // in that case is an empty string; bail out explicitly instead of
    // letting foreach raise a warning on a non-array.
    $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY);
    if ($characters === FALSE) {
      return $result;
    }

    // Transliterate each Unicode character in turn.
    foreach ($characters as $character) {
      $code = self::ordUTF8($character);
      if ($code === -1) {
        $to_add = $unknown_character;
      }
      else {
        $to_add = $this->replace($code, $langcode, $unknown_character);
      }

      // Check if this exceeds the maximum allowed length. The limit applies
      // to the transliterated output, and a replacement is never cut in half:
      // the first replacement that does not fit ends the result.
      if (isset($max_length)) {
        $length += strlen($to_add);
        if ($length > $max_length) {
          // There is no more space.
          return $result;
        }
      }

      $result .= $to_add;
    }

    return $result;
  }

  /**
   * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
   *
   * @param string $character
   *   A single UTF-8 character.
   *
   * @return int
   *   The character code, or -1 if an illegal character is found. An empty
   *   string, or a string shorter than its lead byte announces, is illegal.
   */
  protected static function ordUTF8($character) {
    $length = strlen($character);
    if ($length === 0) {
      // Nothing to decode; avoid reading an uninitialized string offset.
      return -1;
    }

    $first_byte = ord($character[0]);

    if (($first_byte & 0x80) == 0) {
      // Single-byte form: 0xxxxxxx.
      return $first_byte;
    }
    if (($first_byte & 0xe0) == 0xc0) {
      // Two-byte form: 110xxxxx 10xxxxxx.
      if ($length < 2) {
        return -1;
      }
      return (($first_byte & 0x1f) << 6) + (ord($character[1]) & 0x3f);
    }
    if (($first_byte & 0xf0) == 0xe0) {
      // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
      if ($length < 3) {
        return -1;
      }
      return (($first_byte & 0x0f) << 12) + ((ord($character[1]) & 0x3f) << 6) + (ord($character[2]) & 0x3f);
    }
    if (($first_byte & 0xf8) == 0xf0) {
      // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      if ($length < 4) {
        return -1;
      }
      return (($first_byte & 0x07) << 18) + ((ord($character[1]) & 0x3f) << 12) + ((ord($character[2]) & 0x3f) << 6) + (ord($character[3]) & 0x3f);
    }

    // Other forms are not legal.
    return -1;
  }

  /**
   * Replaces a single Unicode character using the transliteration database.
   *
   * @param int $code
   *   The character code of a Unicode character.
   * @param string $langcode
   *   The language code of the language the character is in.
   * @param string $unknown_character
   *   The character to substitute for characters without transliterated
   *   equivalents.
   *
   * @return string
   *   US-ASCII replacement character. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function replace($code, $langcode, $unknown_character) {
    if ($code < 0x80) {
      // Already lower ASCII.
      return chr($code);
    }

    // See if there is a language-specific override for this character. The
    // override table for a language is loaded the first time it is needed.
    if (!isset($this->languageOverrides[$langcode])) {
      $this->readLanguageOverrides($langcode);
    }
    if (isset($this->languageOverrides[$langcode][$code])) {
      return $this->languageOverrides[$langcode][$code];
    }

    return $this->lookupReplacement($code, $unknown_character);
  }

  /**
   * Looks up the generic replacement for a Unicode character code.
   *
   * @param int $code
   *   The Unicode character code.
   * @param string $unknown_character
   *   (optional) The character to substitute for characters without entries in
   *   the replacement tables.
   *
   * @return string
   *   US-ASCII replacement characters. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned. The replacement can contain
   *   multiple characters.
   */
  protected function lookupReplacement($code, $unknown_character = '?') {
    // The generic tables are split into banks of 256 characters: the bank is
    // the code shifted right by 8 bits, and the low 8 bits index into it.
    $bank = $code >> 8;
    if (!isset($this->genericMap[$bank])) {
      $this->readGenericData($bank);
    }
    $code = $code & 0xff;
    return isset($this->genericMap[$bank][$code]) ? $this->genericMap[$bank][$code] : $unknown_character;
  }

  /**
   * Reads in language overrides for a language code.
   *
   * The data is read from files named "$langcode.php" in
   * PhpTransliteration::$dataDirectory, where every character of $langcode
   * other than ASCII letters and hyphens is dropped from the file name. These
   * files should set up an array variable $overrides with an element whose
   * key is $langcode and whose value is an array whose keys are character
   * codes, and whose values are their transliterations in this language. The
   * character codes can be for any valid Unicode character, independent of
   * the number of bytes.
   *
   * If the file is missing, or does not provide an array for $langcode, an
   * empty override table is stored so the file is not read again.
   *
   * @param string $langcode
   *   Code for the language to read.
   */
  protected function readLanguageOverrides($langcode) {
    // Figure out the file name to use by sanitizing the language code,
    // just in case.
    $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php';

    // Read in this file, which should set up a variable called $overrides,
    // which will be local to this function.
    if (is_file($file)) {
      include $file;
    }

    // The file is located by the sanitized code but the table is keyed by
    // the code as passed in, so the expected element may be absent even
    // when the file exists. Always store an array: storing NULL would make
    // isset() in replace() fail and re-read the file for every character.
    if (isset($overrides) && is_array($overrides) && isset($overrides[$langcode]) && is_array($overrides[$langcode])) {
      $this->languageOverrides[$langcode] = $overrides[$langcode];
    }
    else {
      $this->languageOverrides[$langcode] = array();
    }
  }

  /**
   * Reads in generic transliteration data for a bank of characters.
   *
   * The data is read in from a file named "x$bank.php" (with $bank in
   * hexadecimal notation, zero-padded to at least two digits) in
   * PhpTransliteration::$dataDirectory. These files should set up a variable
   * $base containing an array whose numerical indices are the low 8 bits of
   * the character code, and whose values are the transliterations of these
   * characters into US-ASCII.
   *
   * If the file is missing or does not define an array $base, an empty table
   * is stored for the bank.
   *
   * @param int $bank
   *   The character code shifted right by 8 bits, or 0 for the ASCII range.
   */
  protected function readGenericData($bank) {
    // Figure out the file name.
    $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';

    // Read in this file, which should set up a variable called $base, which
    // will be local to this function.
    if (is_file($file)) {
      include $file;
    }
    if (!isset($base) || !is_array($base)) {
      $base = array();
    }

    // Save this data.
    $this->genericMap[$bank] = $base;
  }

}