.
 *
 * ----------------------------------------------------------------------
 *
 * Class Name: Tagging Arabic Word Class
 *
 * Filename: ArWordTag.class.php
 *
 * Original Author(s): Khaled Al-Sham'aa
 *
 * Purpose: Arabic grammarians describe Arabic as being derived from
 * three main categories: noun, verb and particle. This class is
 * built to recognize the class of a given Arabic word.
 *
 * ----------------------------------------------------------------------
 *
 * Tagging Arabic Word
 *
 * This PHP class can identify names, places, dates, and other noun
 * words and phrases in the Arabic language that establish the meaning of a body
 * of text.
 *
 * The process of identifying names, places, dates, and other noun words and
 * phrases that establish the meaning of a body of text is critical to software
 * systems that process large amounts of unstructured data coming from sources such
 * as email, document files, and the Web.
 *
 * Arabic words are classified into three main classes, namely, verb, noun and
 * particle. Verbs are sub-classified into three subclasses (Past verbs, Present
 * Verbs, etc.); nouns into forty-six subclasses (e.g. Active participle, Passive
 * participle, Exaggeration pattern, Adjectival noun, Adverbial noun, Infinitive
 * noun, Common noun, Pronoun, Quantifier, etc.) and particles into twenty-three
 * subclasses (e.g. additional, resumption, Indefinite, Conditional, Conformational,
 * Prohibition, Imperative, Optative, Reasonal, Dubious, etc.), and it is from these
 * three main classes that the rest of the language is derived.
 *
 * The most important aspect of this system of describing Arabic is that all the
 * subclasses of these three main classes inherit properties from the parent
 * classes.
 *
 * Arabic is very rich in categorising words, and contains classes for almost every
 * form of word imaginable. For example, there are classes for nouns of instruments,
 * nouns of place and time, nouns of activity and so on.
If we tried to use all the
 * subclasses described by Arabic grammarians, the size of the tagset would soon
 * reach more than two or three hundred tags. For this reason, we have chosen only
 * the main classes. But because of the way all the classes inherit from others, it
 * would be quite simple to extend this tagset to include more subclasses.
 *
 * Example:
 *
 *     include('./Arabic.php');
 *     $obj = new Arabic('ArWordTag');
 *
 *     $hStr = $obj->highlightText($str, '#80B020');
 *
 *     echo $str . '<br />' . $hStr . '<br />';
 *
 *     $taggedText = $obj->tagText($str);
 *
 *     foreach ($taggedText as $wordTag) {
 *         list($word, $tag) = $wordTag;
 *
 *         if ($tag == 1) {
 *             echo "$word is Noun, ";
 *         }
 *
 *         if ($tag == 0) {
 *             echo "$word is not Noun, ";
 *         }
 *     }
 *
*
 * @category  I18N
 * @package   Arabic
 * @author    Khaled Al-Shamaa
 * @copyright 2006-2010 Khaled Al-Shamaa
 *
 * @license   LGPL
 * @link      http://www.ar-php.org
 */

// New in PHP V5.3: Namespaces
// namespace I18N/Arabic/ArWordTag;

/**
 * Heuristic tagger that decides, word by word, whether an Arabic word is a noun.
 *
 * All three public methods are static and stateless. The decision is based
 * only on the word itself and on the word that precedes it: a fixed list of
 * particles, a numeric test, a handful of prefix/suffix letter tests and a
 * list of morphological patterns.
 *
 * Words are analysed character by character as UTF-8 (PCRE "u" modifier), so
 * that a letter that occupies several bytes is still counted as one letter.
 *
 * NOTE(review): the public *Input / *Output properties below still declare
 * 'windows-1256', while the Arabic literals in this file are compared as
 * UTF-8. Presumably an outer wrapper reads these properties to convert text
 * before calling the methods - confirm against that wrapper and align the
 * declared charset with the encoding this file is actually stored in.
 *
 * @category  I18N
 * @package   Arabic
 * @author    Khaled Al-Shamaa
 * @copyright 2006-2010 Khaled Al-Shamaa
 *
 * @license   LGPL
 * @link      http://www.ar-php.org
 */
class ArWordTag
{
    /**
     * Particles after which the following word is always tagged as a noun
     * @var Array
     */
    protected static $particlePreNouns = array(
        'عن', 'في', 'مذ', 'منذ', 'من', 'الى', 'على', 'حتى',
        'الا', 'غير', 'سوى', 'خلا', 'عدا', 'حاشا', 'ليس'
    );

    /**
     * Alef variants (with hamza above, hamza below, madda) that are folded
     * into a bare alef before a word is analysed
     * @var Array
     */
    protected static $normalizeAlef = array('أ', 'إ', 'آ');

    /**
     * Diacritics removed from a word before its letters are examined,
     * written as UTF-8 byte sequences because combining marks are hard to
     * read in source code
     * @var Array
     */
    protected static $normalizeDiacritics = array(
        "\xD9\x8E", // U+064E fatha
        "\xD9\x8B", // U+064B fathatan
        "\xD9\x8F", // U+064F damma
        "\xD9\x8C", // U+064C dammatan
        "\xD9\x90", // U+0650 kasra
        "\xD9\x8D", // U+064D kasratan
        "\xD9\x92", // U+0652 sukun
        "\xD9\x91"  // U+0651 shadda
    );

    /**
     * Tanween (nunation) marks; a word ending in one of them is a noun
     * @var Array
     */
    protected static $tanween = array(
        "\xD9\x8B", // U+064B fathatan
        "\xD9\x8C", // U+064C dammatan
        "\xD9\x8D"  // U+064D kasratan
    );

    /**
     * Whole-word patterns (nouns of instrument, place and time, and related
     * forms) that mark a word as a noun. \S stands for "any one letter";
     * the "u" modifier makes PCRE count UTF-8 characters instead of bytes.
     * @var Array
     */
    protected static $nounPatterns = array(
        '/^م\S{3}$/u',
        '/^م\S{2}ا\S$/u',
        '/^م\S{3}ة$/u',
        '/^\S{2}ا\S$/u',
        '/^\Sا\Sو\S$/u',
        '/^\S{2}و\S$/u',
        '/^\S{2}ي\S$/u',
        '/^م\S{2}و\S$/u',
        '/^م\S{2}ي\S$/u',
        '/^\S{3}ة$/u',
        '/^\S{2}ا\Sة$/u',
        '/^\Sا\S{2}ة$/u',
        '/^\Sا\Sو\Sة$/u',
        '/^ا\S{2}و\Sة$/u',
        '/^ا\S{2}ي\S$/u',
        '/^ا\S{3}$/u',
        '/^\S{3}ى$/u',
        '/^\S{3}اء$/u',
        '/^\S{3}ان$/u',
        '/^م\Sا\S{2}$/u',
        '/^من\S{3}$/u',
        '/^مت\S{3}$/u',
        '/^مست\S{3}$/u',
        '/^م\Sت\S{2}$/u',
        '/^مت\Sا\S{2}$/u',
        '/^\Sا\S{2}$/u'
    );

    /**
     * "isNoun" method input charset
     * @var String
     */
    public $isNounInput = 'windows-1256';

    /**
     * Name of the textual "isNoun" method parameters
     * @var Array
     */
    public $isNounVars = array('word', 'word_befor');

    /**
     * "tagText" method output charset
     * @var String
     */
    public $tagTextOutput = 'windows-1256';

    /**
     * "tagText" method input charset
     * @var String
     */
    public $tagTextInput = 'windows-1256';

    /**
     * Name of the textual "tagText" method parameters
     * @var Array
     */
    public $tagTextVars = array('str');

    /**
     * "highlightText" method output charset
     * @var String
     */
    public $highlightTextOutput = 'windows-1256';

    /**
     * "highlightText" method input charset
     * @var String
     */
    public $highlightTextInput = 'windows-1256';

    /**
     * Name of the textual "highlightText" method parameters
     * @var Array
     */
    public $highlightTextVars = array('str');

    /**
     * Nothing to initialize; every method of this class is static
     */
    public function __construct()
    {
    }

    /**
     * Split a UTF-8 string into an array of single characters
     *
     * @param string $text Text to split
     *
     * @return array List of characters; empty when $text is empty or is not
     *               valid UTF-8 (preg_split() fails in that case)
     */
    protected static function splitChars($text)
    {
        $chars = preg_split('//u', $text, -1, PREG_SPLIT_NO_EMPTY);

        return ($chars === false) ? array() : $chars;
    }

    /**
     * Check if a given Arabic word is a noun or not
     *
     * The rules are tried in a fixed order and the first one that applies
     * decides the result; a word that matches no rule is reported as
     * "not a noun". An empty word is never a noun unless the word before it
     * forces the answer (particle or number).
     *
     * @param string $word       Word you want to check if it is a noun
     * @param string $word_befor The word that precedes $word in the text
     *                           (empty string when there is none)
     *
     * @return boolean TRUE if given word is Arabic noun
     * @author Khaled Al-Shamaa
     */
    public static function isNoun($word, $word_befor)
    {
        $word       = str_replace(self::$normalizeAlef, 'ا', trim($word));
        $word_befor = str_replace(self::$normalizeAlef, 'ا', trim($word_befor));

        // Preceded by a preposition (or a similar particle): genitive noun
        if (in_array($word_befor, self::$particlePreNouns, true)) {
            return true;
        }

        // The word itself is a number, or it follows a number (counted noun)
        if (is_numeric($word) || is_numeric($word_befor)) {
            return true;
        }

        // Ends with tanween; must be tested before diacritics are stripped
        $chars = self::splitChars($word);
        if (count($chars) > 0
            && in_array($chars[count($chars) - 1], self::$tanween, true)
        ) {
            return true;
        }

        // From here on only the bare letters matter
        $word    = str_replace(self::$normalizeDiacritics, '', $word);
        $chars   = self::splitChars($word);
        $wordLen = count($chars);

        // Missing positions become '' so short words simply fail the tests
        $first      = ($wordLen > 0) ? $chars[0] : '';
        $second     = ($wordLen > 1) ? $chars[1] : '';
        $last       = ($wordLen > 0) ? $chars[$wordLen - 1] : '';
        $beforeLast = ($wordLen > 1) ? $chars[$wordLen - 2] : '';

        // Starts with the definite article "al"
        if ($wordLen >= 5 && $first === 'ا' && $second === 'ل') {
            return true;
        }

        // Contains three or more alefs. The original comment adds "unless
        // the third alef is final", but the code has never checked that.
        if (substr_count($word, 'ا') >= 3) {
            return true;
        }

        // Feminine by form: ends with teh marbuta, hamza or alef maksura
        if ($wordLen >= 4
            && ($last === 'ة' || $last === 'ء' || $last === 'ى')
        ) {
            return true;
        }

        // Ends with alef + open teh: sound feminine plural
        if ($wordLen >= 5 && $last === 'ت' && $beforeLast === 'ا') {
            return true;
        }

        // Starts with NOON followed by REH, LAM or NOON: a verb, not a noun
        if ($wordLen > 3
            && $first === 'ن'
            && ($second === 'ر' || $second === 'ل' || $second === 'ن')
        ) {
            return false;
        }

        // Starts with YEH followed by one of the letters below: a verb.
        // NOTE(review): the original comment listed HAH here, but the set
        // actually contains HEH - confirm which letter was intended.
        // $wordLen > 3 guarantees $second is not empty before strpos().
        if ($wordLen > 3
            && $first === 'ي'
            && strpos('يذجهخزشصضطظغك', $second) !== false
        ) {
            return false;
        }

        // Starts with BEH or MEEM followed by BEH, FEH or MEEM: a noun
        if ($wordLen > 3
            && ($first === 'ب' || $first === 'م')
            && ($second === 'ب' || $second === 'ف' || $second === 'م')
        ) {
            return true;
        }

        // Five-letter words ending in alef/waw/yeh + noon are nouns unless
        // they start with one of the imperfect-verb prefix letters
        if (preg_match('/^[^ايتن]\S{2}[اوي]ن$/u', $word)) {
            return true;
        }

        // Matches a pattern of the noun of instrument, place or time
        foreach (self::$nounPatterns as $pattern) {
            if (preg_match($pattern, $word)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tag all words in a given Arabic string if they are nouns or not
     *
     * The string is split on single space characters only; empty pieces
     * produced by consecutive spaces are skipped and do not count as the
     * "previous word" of the next one.
     *
     * @param string $str Arabic string you want to tag all its words
     *
     * @return array Two dimension array where item[i][0] represent the word i
     *               in the given string, and item[i][1] is 1 if that word is
     *               noun and 0 if it is not
     * @author Khaled Al-Shamaa
     */
    public static function tagText($str)
    {
        $tagged   = array();
        $prevWord = '';

        foreach (explode(' ', $str) as $word) {
            if ($word === '') {
                continue;
            }

            $tagged[] = array($word, self::isNoun($word, $prevWord) ? 1 : 0);
            $prevWord = $word;
        }

        return $tagged;
    }

    /**
     * Highlight all nouns in a given Arabic string
     *
     * Consecutive nouns are wrapped together in one <span> element. A
     * particle that directly follows a noun is held back and written
     * together with the next word, so it stays inside the current span
     * instead of splitting it. The words themselves are copied to the
     * output unchanged (they are not HTML-escaped).
     *
     * @param string $str   Arabic string you want to highlight
     *                      all its nouns
     * @param string $style Name of the CSS class you would like to apply
     *
     * @return string Arabic string in HTML format where all nouns highlighted
     * @author Khaled Al-Shamaa
     */
    public static function highlightText($str, $style = null)
    {
        // Escape the class name so it cannot break out of the attribute
        $openTag  = '<span class="'
                  . htmlspecialchars((string) $style, ENT_QUOTES, 'UTF-8')
                  . '">';
        $closeTag = '</span>';

        $html     = '';
        $prevTag  = 0;
        $prevWord = '';

        foreach (self::tagText($str) as $wordTag) {
            list($word, $tag) = $wordTag;

            if ($prevTag === 1) {
                if (in_array($word, self::$particlePreNouns, true)) {
                    // Hold the particle back; it is emitted with the next word
                    $prevWord = $word;
                    continue;
                }

                if ($tag === 0) {
                    // Leaving a run of nouns
                    $html .= $closeTag . " \r\n";
                }
            } elseif ($tag === 1) {
                // Entering a run of nouns
                $html .= " \r\n" . $openTag;
            }

            $html    .= ' ' . $prevWord . ' ' . $word;
            $prevWord = '';
            $prevTag  = $tag;
        }

        if ($prevTag === 1) {
            $html .= $closeTag . " \r\n";
        }

        // A particle held back at the very end of the text has no next word
        // to travel with; write it out instead of silently dropping it
        if ($prevWord !== '') {
            $html .= ' ' . $prevWord;
        }

        return $html;
    }
}