BASE_URL, 0, 7);
if (stristr($t_urlStart, "http://")===FALSE){
$this->BASE_URL = "http://".$this->BASE_URL;
}
$url_lastChar = substr($this->BASE_URL, -1);
$url_5lastChar = substr($this->BASE_URL, -5);
$this->BASE_URL = str_replace(Array("\r", "\n"), "", $this->BASE_URL );
return true;
}
/**
 * Split a URI into the pieces found between its slashes.
 *
 * @param string $uri URI (or path) to split.
 * @return array Pieces in their original order. Adjacent, leading or
 *               trailing slashes yield empty strings in the result.
 */
public function uri_parts($uri){
    return explode("/", $uri);
}
/**
 * Set the URL this object works with.
 *
 * The value is stored in BASE_URL exactly as given; no validation or
 * normalisation happens here.
 *
 * @param string $uri URL to store.
 */
public function uri_set($uri){
$this->BASE_URL = $uri;
}
/**
 * Set the source code to scrape.
 *
 * Stores the given text in URL_SOURCECODE, the property the extraction
 * methods (links_get, images_get, src_get, ip_get) read from. Nothing is
 * downloaded.
 *
 * @param string $sr Source code (e.g. HTML) to scrape.
 */
public function source_set($sr){
$this->URL_SOURCECODE = $sr;
}
/**
 * Tell whether there is source code available to scrape.
 *
 * @return bool true when URL_SOURCECODE is set and not empty() (so false,
 *              null, "" and "0" all count as "no source"), false otherwise.
 */
public function source_check(){
    return (isset($this->URL_SOURCECODE) && !empty($this->URL_SOURCECODE));
}
/**
 * Download the content of a URL and keep it as the current source.
 *
 * When $uri is given (non-empty) it replaces BASE_URL first. The URL is
 * passed through url_correct() before the request. The request uses
 * GET_CONTENT_TIMEOUT as timeout and follows at most 5 redirects. The
 * result is stored in URL_SOURCECODE and also returned.
 *
 * @param string|false $uri URL to fetch; false/empty keeps the current BASE_URL.
 * @return string|false The downloaded content, or false when there is no
 *                      URL to fetch or the download fails.
 */
public function source_get($uri=false){
    if (!empty($uri)) {
        $this->BASE_URL = $uri;
    }

    // Without a URL there is nothing to download.
    if ($this->BASE_URL === false || $this->BASE_URL == "") {
        return false;
    }

    $this->url_correct(); // correct possible bad url

    $http_options = array(
        'timeout' => $this->GET_CONTENT_TIMEOUT,
        'max_redirects' => '5'
    );
    $context = stream_context_create(array('http' => $http_options));

    // Warnings are silenced on purpose: a failed download is reported
    // through the false return value.
    $this->URL_SOURCECODE = @file_get_contents($this->BASE_URL, 0, $context);

    return $this->URL_SOURCECODE;
}
/**
 * Collect the target of every href="..." attribute in the loaded source.
 *
 * Single quotes in the source are treated as double quotes, so
 * href='...' is found too. Backslashes, line feeds and carriage returns
 * are removed from each link and the result is trimmed.
 *
 * Links that are not absolute are rebuilt as "<scheme>://<host>/<link>",
 * using the scheme and host of BASE_URL (as reported by link_host()).
 * A link is considered absolute, and left untouched, when it starts with
 * "http", with "//" or with any URI scheme ("mailto:", "javascript:",
 * "tel:", "ftp:", ...). A link that is exactly "#" is skipped.
 *
 * @param string|false $filters Optional case-insensitive text filter:
 *        "+text" (or plain "text") keeps only links containing text,
 *        "-text" drops links containing text. false, or a filter with no
 *        text after the sign, disables filtering.
 * @return array|false List of links, or false when URL_SOURCECODE is false.
 */
public function links_get($filters=false){
    if ($this->URL_SOURCECODE === false){
        return false;
    }

    // The filter does not depend on the link, so parse it once up front.
    $filter_mode = "+";
    $filter_text = "";
    if ($filters !== false){
        $filter_text = (string)$filters;
        $first_char_filter = substr($filter_text, 0, 1);
        if ($first_char_filter == "-" || $first_char_filter == "+"){
            $filter_mode = $first_char_filter;
            $filter_text = (string)substr($filter_text, 1);
        }
    }
    // An empty needle makes stristr() behave differently across PHP
    // versions, so an empty filter simply means "no filtering".
    $has_filter = ($filter_text !== "");

    $array_return = Array();
    $txt = str_replace("'", '"', (string)$this->URL_SOURCECODE);
    preg_match_all('/href="([^"]+)"/', $txt, $salida);
    // Use the captured attribute value, so that a "href=" appearing inside
    // the URL itself is not stripped together with the attribute name.
    $hrefs = isset($salida[1]) ? $salida[1] : Array();

    $base_prefix = null; // "<scheme>://<host>/", built on first use

    foreach ($hrefs as $href){
        // clean the url
        $link_d = trim(str_replace(Array("\\", "\n", "\r"), "", $href));

        $is_absolute = (substr($link_d, 0, 4) == "http")
            || (substr($link_d, 0, 2) == "//")
            || preg_match('/^[a-z][a-z0-9+.\-]*:/i', $link_d);

        if (!$is_absolute){
            if ($link_d == "#"){
                continue;
            }
            if (substr($link_d, 0, 1) == "/"){
                $link_d = (string)substr($link_d, 1);
            }
            if ($base_prefix === null){
                $s_host = rtrim($this->link_host($this->BASE_URL), "/");
                $s_proto = $this->link_host($this->BASE_URL, "protocol");
                if ($s_proto == ""){
                    // Same default url_correct() applies to scheme-less URLs.
                    $s_proto = "http";
                }
                $base_prefix = $s_proto."://".$s_host."/";
            }
            // Exactly one slash between host and path: the link's own
            // leading slash was removed above.
            $link_d = $base_prefix.$link_d;
        }

        // check the filters
        if ($has_filter){
            $found = (stristr($link_d, $filter_text) !== false);
            if (($filter_mode == "-") ? $found : !$found){
                continue;
            }
        }

        $array_return[] = $link_d;
    }

    return $array_return;
}
/**
 * Tell whether a link points outside the site of a given URL.
 *
 * The test is a case-insensitive substring search: the link is treated
 * as internal when the host of $url (as returned by link_host()) appears
 * anywhere inside $link.
 *
 * @param string $link Link to classify.
 * @param string $url  URL whose host identifies the site.
 * @return bool true when the link is external, false when it is internal.
 */
public function link_is_external($link, $url){
    $site_host = $this->link_host($url);
    // Loose comparison kept as in the original check.
    return (stristr($link, $site_host) == FALSE);
}
/**
 * Extract the host or the scheme of a URI.
 *
 * For $part "host": returns the host component; when the URI has no host
 * (for example "example.com/page", which parse_url() reports as a path),
 * the path component is returned instead. For $part "protocol": returns
 * the scheme ("http", "https", ...).
 *
 * @param string $uri  URI to inspect.
 * @param string $part "host" (default) or "protocol".
 * @return string The requested part, or "" when it is missing, when $part
 *                is not recognised, or when the URI cannot be parsed.
 */
public function link_host($uri, $part = "host"){
    $a_url = parse_url($uri);

    // parse_url() returns false on seriously malformed URLs.
    if (!is_array($a_url)){
        return "";
    }

    if ($part == "host"){
        if (isset($a_url["host"])){
            return $a_url["host"];
        }
        // Components that are absent from the URI are absent from the
        // array as well, so the path fallback has to be checked too.
        return isset($a_url["path"]) ? $a_url["path"] : "";
    }

    if ($part == "protocol" && isset($a_url["scheme"])){
        return $a_url["scheme"];
    }

    return "";
}
/**
 * Collect the images referenced by src="..." attributes in the loaded source.
 *
 * Single quotes in the source are treated as double quotes. A src value
 * counts as an image when it ends (case-insensitively) with jpg, jpeg,
 * png or gif.
 *
 * Values starting with "http", with "//" or with any URI scheme
 * (e.g. "data:") are returned untouched. Any other value is appended to
 * BASE_URL with exactly one "/" in between.
 * NOTE(review): relative values are joined to the full BASE_URL, not to
 * the site root as links_get() does — confirm this is intended.
 *
 * @return array List of image URLs (empty when nothing matches).
 */
public function images_get(){
    $array_return = Array();
    $txt = str_replace("'", '"', (string)$this->URL_SOURCECODE);
    preg_match_all('/src="([^"]+)"/', $txt, $salida);
    // Captured attribute values, without the src=" ... " wrapper.
    $sources = isset($salida[1]) ? $salida[1] : Array();

    $image_extensions = Array("jpg", "jpeg", "png", "gif");

    foreach ($sources as $link_d){
        $lower_link = strtolower($link_d);
        $is_image = false;
        foreach ($image_extensions as $extension){
            if (substr($lower_link, -strlen($extension)) === $extension){
                $is_image = true;
                break;
            }
        }
        if (!$is_image){
            continue;
        }

        $is_absolute = (substr($link_d, 0, 4) == "http")
            || (substr($link_d, 0, 2) == "//")
            || preg_match('/^[a-z][a-z0-9+.\-]*:/i', $link_d);

        // if path relative
        if (!$is_absolute){
            if (substr($link_d, 0, 1) == "/"){
                $link_d = (string)substr($link_d, 1);
            }
            // The link no longer starts with "/", so the separator only
            // depends on how BASE_URL ends.
            $then_separator = (substr($this->BASE_URL, -1) == "/") ? "" : "/";
            $link_d = $this->BASE_URL.$then_separator.$link_d;
        }

        $array_return[] = $link_d;
    }

    return $array_return;
}
/**
 * Collect the value of every src="..." attribute in the loaded source.
 *
 * Single quotes in the source are treated as double quotes, so
 * src='...' is found too.
 *
 * Values starting with "http", with "//" or with any URI scheme
 * (e.g. "data:") are returned untouched. Any other value is appended to
 * BASE_URL with exactly one "/" in between.
 * NOTE(review): relative values are joined to the full BASE_URL, not to
 * the site root as links_get() does — confirm this is intended.
 *
 * @return array List of src URLs (empty when nothing matches).
 */
public function src_get(){
    $array_return = Array();
    $txt = str_replace("'", '"', (string)$this->URL_SOURCECODE);
    preg_match_all('/src="([^"]+)"/', $txt, $salida);
    // Captured attribute values, without the src=" ... " wrapper.
    $sources = isset($salida[1]) ? $salida[1] : Array();

    foreach ($sources as $link_d){
        $is_absolute = (substr($link_d, 0, 4) == "http")
            || (substr($link_d, 0, 2) == "//")
            || preg_match('/^[a-z][a-z0-9+.\-]*:/i', $link_d);

        // if path relative
        if (!$is_absolute){
            if (substr($link_d, 0, 1) == "/"){
                $link_d = (string)substr($link_d, 1);
            }
            // The link no longer starts with "/", so the separator only
            // depends on how BASE_URL ends.
            $then_separator = (substr($this->BASE_URL, -1) == "/") ? "" : "/";
            $link_d = $this->BASE_URL.$then_separator.$link_d;
        }

        $array_return[] = $link_d;
    }

    return $array_return;
}
/**
 * Find the IP addresses written in the loaded source.
 *
 * IPv4 addresses are searched first. The IPv6 pattern is only tried when
 * no IPv4 address is found, so the result never mixes both kinds.
 *
 * @return array|false List of matched address strings (possibly empty),
 *                     or false when source_check() reports no source.
 */
public function ip_get(){
    if (!$this->source_check()) return false;

    $ipv4_pattern = '/\b(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b/';
    $ipv6_pattern = '/\b\s*((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:)))(%.+)?\s*\b/';

    // IPv4 first; fall back to IPv6 only when IPv4 finds nothing.
    $found_ipv4 = preg_match_all($ipv4_pattern, $this->URL_SOURCECODE, $matches, PREG_OFFSET_CAPTURE);
    if (!$found_ipv4) {
        preg_match_all($ipv6_pattern, $this->URL_SOURCECODE, $matches, PREG_OFFSET_CAPTURE);
    }

    $addresses = Array();
    if (isset($matches[0])) {
        // With PREG_OFFSET_CAPTURE every hit is a (text, offset) pair;
        // only the text is wanted.
        foreach ($matches[0] as $hit) {
            $addresses[] = $hit[0];
        }
    }
    return $addresses;
}
/**
 * Count the keywords of a text, most frequent first.
 *
 * HTML entities are decoded and tags stripped. The text is split on
 * whitespace; every token is lower-cased, has "," "." and ":" removed
 * and is trimmed. Tokens that end up empty are ignored.
 *
 * With $size 1 the result counts single words longer than two bytes.
 * With $size 2 the result counts every pair of consecutive tokens,
 * joined by one space (no length restriction).
 *
 * @param string $t    Text (may contain HTML) to analyse.
 * @param int    $size 1 for single words (default), 2 for two-word phrases.
 * @return array|null Map keyword => number of occurrences, sorted by
 *                    count descending; null for any other $size. Purely
 *                    numeric words appear as integer keys, as PHP arrays
 *                    always store them.
 */
public function keywords_text($t, $size=1){
    $content = strip_tags(html_entity_decode($t));

    // Split on any whitespace so that words separated by line breaks or
    // tabs are not glued together.
    $tokens = preg_split('/\s+/', $content, -1, PREG_SPLIT_NO_EMPTY);
    if ($tokens === false){
        $tokens = Array();
    }

    $a_words = Array();
    foreach ($tokens as $token){
        $token = trim(str_replace(Array(",", ".", ":"), "", strtolower($token)));
        if ($token !== ""){
            $a_words[] = $token;
        }
    }

    if ($size==2){
        $a_final_two_phrases = Array();
        $total_words = count($a_words);
        for ($i = 0; $i + 1 < $total_words; $i++){
            $str_two = $a_words[$i]." ".$a_words[$i+1];
            if (!isset($a_final_two_phrases[$str_two])){
                $a_final_two_phrases[$str_two] = 1;
            }else{
                $a_final_two_phrases[$str_two]++;
            }
        }
        asort($a_final_two_phrases);
        return array_reverse($a_final_two_phrases, true);
    }

    if ($size==1){
        $a_total_words = Array();
        foreach ($a_words as $word){
            if (strlen($word) > 2){
                if (!isset($a_total_words[$word])){
                    $a_total_words[$word] = 1;
                }else{
                    $a_total_words[$word]++;
                }
            }
        }
        asort($a_total_words);
        // preserve_keys is required: without it array_reverse() renumbers
        // integer keys, so numeric keywords such as "2019" would be lost.
        return array_reverse($a_total_words, true);
    }

    return null;
}
}
?>