<?php
/**
 * ----------------------------------------------------------------------
 *
 * Class Name: Detect Arabic String Character Set
 *
 * Filename:   ArCharsetD.class.php
 *
 * Original    Author(s): Khaled Al-Sham'aa
 *
 * Purpose:    This class returns the Arabic character set used by a given
 *             Arabic string. The character sets it can detect are the
 *             three most popular ones: Windows-1256, ISO 8859-6, and UTF-8.
 *
 * ----------------------------------------------------------------------
 *
 * Detect Arabic String Character Set
 *
 * The last step of the Information Retrieval process is to display the found
 * documents to the user. However, some difficulties might occur at that point.
 * English texts are usually written in the ASCII standard. Unlike English,
 * many languages have several different character sets and no single
 * standard. This plurality of standards causes problems, especially in a web
 * environment.
 *
 * This PHP class returns the Arabic character set used by a given Arabic
 * string. The character sets it can detect are the three most popular ones:
 * Windows-1256, ISO 8859-6, and UTF-8.
/**
 * Example:
 *
 *     include('./Arabic.php');
 *     $obj = new Arabic('ArCharsetD');
 *
 *     $charset = $obj->ArCharsetD->getCharset($text);
 *
 * @category  I18N
 * @package   Arabic
 * @author    Khaled Al-Shamaa
 * @copyright 2006-2010 Khaled Al-Shamaa
 *
 * @license   LGPL
 * @link      http://www.ar-php.org
 */

// New in PHP V5.3: Namespaces
// namespace I18N/Arabic/ArCharsetD;

/**
 * Detects the character set of an Arabic string.
 *
 * Detection counts the byte sequences that encode the letters Alef, Lam and
 * Yaa in each candidate character set and compares the totals.
 *
 * @category  I18N
 * @package   Arabic
 * @author    Khaled Al-Shamaa
 * @copyright 2006-2010 Khaled Al-Shamaa
 *
 * @license   LGPL
 * @link      http://www.ar-php.org
 */
class ArCharsetD
{
    /**
     * Byte sequences for Alef, Lam and Yaa (in that order) per character set.
     *
     * They are written as explicit byte escapes so that they stay correct
     * whatever encoding this source file is saved or re-encoded in.
     * Alef is the same byte (0xC7) in Windows-1256 and ISO-8859-6, so those
     * two character sets are told apart by Lam and Yaa only.
     *
     * @var array
     */
    private static $letterSignatures = array(
        'windows-1256' => array("\xC7", "\xE1", "\xED"),
        'iso-8859-6'   => array("\xC7", "\xE4", "\xEA"),
        'utf-8'        => array("\xD8\xA7", "\xD9\x84", "\xD9\x8A"),
    );

    /**
     * Loads initialize values (nothing to initialize at present).
     */
    public function __construct()
    {
    }

    /**
     * Count the hits for three very frequent Arabic letters (Alef, Lam and
     * Yaa) as encoded in each candidate character set, then express each
     * count as a rounded percentage of all hits.
     *
     * @param String $string Arabic string in unknown format
     *
     * @return Array Character set name ('windows-1256', 'iso-8859-6',
     *               'utf-8') as key and association ratio (0-100, as
     *               returned by round()) as value. All ratios are 0 when
     *               none of the signature letters occurs in the string.
     * @author Khaled Al-Shamaa
     */
    public function guess($string)
    {
        $string = (string) $string;

        $hits  = array();
        $total = 0;
        foreach (self::$letterSignatures as $name => $needles) {
            $hits[$name] = 0;
            foreach ($needles as $needle) {
                $hits[$name] += substr_count($string, $needle);
            }
            $total += $hits[$name];
        }

        $ratios = array();
        foreach ($hits as $name => $count) {
            // Guard the division: an empty or non-Arabic string has no hits.
            $ratios[$name] = ($total > 0) ? round($count * 100 / $total) : 0.0;
        }

        return $ratios;
    }

    /**
     * Find the most probable character set for the given Arabic string.
     *
     * If the string contains an HTML meta tag that declares a charset, the
     * declared name is returned (lower-cased). Otherwise the character set
     * with the highest ratio from guess() is returned; when no signature
     * letters are found all ratios are 0 and the choice between the three
     * names carries no information.
     *
     * @param String $string Arabic string in unknown format
     *
     * @return String The most probable character set; one of
     *                [utf-8|windows-1256|iso-8859-6] when guessed, or the
     *                name declared in the string's meta tag
     * @author Khaled Al-Shamaa
     */
    public function getCharset($string)
    {
        $string = (string) $string;

        // The angle brackets are written as \x3C / \x3E so the pattern
        // survives tools that strip HTML tags from source text. The regex
        // accepts both the http-equiv content="...; charset=X" form and the
        // short charset="X" attribute form.
        $metaPattern = '/\x3Cmeta\b[^\x3E]*?\bcharset\s*=\s*["\']?\s*([a-z0-9_.:\-]+)/i';

        if (preg_match($metaPattern, $string, $matches) === 1) {
            return strtolower($matches[1]);
        }

        $ratios = $this->guess($string);
        // arsort() keeps the keys and leaves the internal pointer on the
        // highest-scoring entry, so key() yields the best candidate.
        arsort($ratios);

        return key($ratios);
    }
}