0.5 (May 07, 2006): - Renamed the project from webSQL to htmlSQL because webSQL already exists - Added more error checks - Added the convert_tagname_to_key function and fixed a few issues 0.1 -> 0.4 (April 2006): - Created main parts of the library */ class htmlsql { // configuration: // htmlSQL version: var $version = '0.5'; // referer and user agent: var $referer = ''; var $user_agent = 'htmlSQL/0.5'; // these are filled on runtime: // (don't touch them) // holds snoopy object: var $snoopy = NULL; // the results array is stored in here: var $results = array(); // the results objects are stored in here: var $results_objects = NULL; // the error message gets stored in here: var $error = ''; // the downloaded page is stored in here: var $page = ''; /* ** init_snoopy ** ** initializes the snoopy class */ function init_snoopy(){ $this->snoopy = new Snoopy(); $this->snoopy->agent = $this->user_agent; $this->snoopy->referer = $this->referer; } /* ** set_user_agent ** ** set a custom user agent */ function set_user_agent($u){ $this->user_agent = $u; } /* ** set_referer ** ** sets the referer */ function set_referer($r){ $this->referer = $r; } /* ** _get_between ** ** returns the content between $start and $end */ function _get_between($content,$start,$end){ $r = explode($start, $content); if (isset($r[1])){ $r = explode($end, $r[1]); return $r[0]; } return ''; } /* ** connect ** ** connects to a data source (url, file or string) */ function connect($type, $resource){ if ($type == 'url'){ return $this->_fetch_url($resource); } else if ($type == 'file') { if (!file_exists($resource)){ $this->error = 'The given file "'.$resource.' does not exist!'; return false; } $this->page = file_get_contents($resource); return true; } else if ($type == 'string') { $this->page = $resource; return true; } return false; } /* ** _fetch_url ** ** downloads the given URL with snoopy */ function _fetch_url($url){ $parsed_url = parse_url($url); if (!isset($parsed_url['scheme']) or $parsed_url['scheme'] != 'http'){ $this->error = 'Unsupported URL sheme given, please just use "HTTP".'; return false; } if (!isset($parsed_url['host']) or $parsed_url['host'] == ''){ $this->error = 'Invalid URL given!'; return false; } $host = $parsed_url['host']; $host .= (isset($parsed_url['port']) and !empty($parsed_url['port'])) ? ':'.$parsed_url['port'] : ''; $path = (isset($parsed_url['path']) and !empty($parsed_url['path'])) ? $parsed_url['path'] : '/'; $path .= (isset($parsed_url['query']) and !empty($parsed_url['query'])) ? '?'.$parsed_url['query'] : ''; $url = 'http://' . $host . $path; $this->init_snoopy(); if($this->snoopy->fetch($url)){ $this->page = $this->snoopy->results; // empty buffer: $this->snoopy->results = ''; } else { $this->error = 'Could not establish a connection to the given URL!'; return false; } return true; } /* ** _extract_all_tags ** ** */ function _extract_all_tags($html, &$tag_names, &$tag_attributes, &$tag_values, $depth=0){ // stop endless loops -> ugly... if ($depth > 99999) return; preg_match_all('/<([a-z0-9\-]+)(.*?)>((.*?)<\/\1>)?/is', $html, $m); if (count($m[0]) != 0){ for ($t=0; $t < count($m[0]); $t++){ $tag_names[] = trim($m[1][$t]); $tag_attributes[] = trim($m[2][$t]); $tag_values[] = trim($m[4][$t]); // go deeper: if (trim($m[4][$t]) != '' and preg_match('/<[a-z0-9\-]+.*?>/is', $m[4][$t])){ $this->_extract_all_tags($m[4][$t], $tag_names, $tag_attributes, $tag_values, $depth+1); } } } } /* ** isolate_content ** ** isolates the content to a specific part */ function isolate_content($start,$end){ $this->page = $this->_get_between($this->page, $start, $end); } /* ** select ** ** restricts the content of a specific tag */ function select($tagname, $num=0){ $num++; if ($tagname != ''){ preg_match('/<'.$tagname.'.*?>(.*?)<\/'.$tagname.'>/is', $this->page, $m); if (isset($m[$num]) and !empty($m[$num])){ $this->page = $m[$num]; } else { $this->error = 'Could not select tag: "'.$tagname.'('.$num.')"!'; return false; } } return true; } /* ** get_content ** ** returns the content of an request */ function get_content(){ return $this->page; } /* ** _clean_array ** ** */ function _clean_array($arr){ $new = array(); for ($x=0; $x < count($arr); $x++){ $arr[$x] = trim($arr[$x]); if ($arr[$x] != ''){ $new[] = $arr[$x]; } } return $new; } /* ** _test_tag ** ** */ function _test_tag($tag_attributes, $if_term){ preg_match_all('/\$([a-z0-9_\-]+)/i', $if_term, $m); if (isset($m[1])){ for ($x=0; $x < count($m[1]); $x++){ $varname = $m[1][$x]; $$varname = ''; } } $new_list = array(); while (list($k,$v) = each($tag_attributes)){ $k = preg_replace('/[^a-z0-9_\-]/i', '', $k); if ($k != ''){ $new_list[$k] = $v; } } unset($tag_attributes); extract($new_list); $r = false; if (@eval('$r = ('.$if_term.');') === false){ $this->error = 'The WHERE statement is invalid (eval() failed)!'; return false; } return $r; } /* ** _match_tags ** ** */ function _match_tags(&$results, &$return_values, &$where_term, &$tag_attributes, &$tag_values, &$tag_names){ $search_mode = ''; $search_attribute = ''; $search_term = ''; /* ** parse: ** ** href LIKE ".htm" ** class = "foo" */ $where_term = trim($where_term); $search_mode = ($where_term == '') ? 'match_all' : 'eval'; for ($x=0; $x < count($tag_attributes); $x++){ $tag_attributes[$x] = $this->parse_attributes($tag_attributes[$x]); if (is_array($tag_names)){ $tag_attributes[$x]['tagname'] = isset($tag_names[$x]) ? $tag_names[$x] : ''; } else { $tag_attributes[$x]['tagname'] = $tag_names; } // string $tag_attributes[$x]['text'] = isset($tag_values[$x]) ? $tag_values[$x] : ''; if ($search_mode == 'eval'){ if ($this->_test_tag($tag_attributes[$x], $where_term)){ $this->_add_result($results, $return_values, $tag_attributes[$x]); } } else if ($search_mode == 'match_all'){ $this->_add_result($results, $return_values, $tag_attributes[$x]); } } } /* ** query ** ** performs a query */ function query($term){ // query results are stored in here: $results = array(); $this->results = NULL; $this->results_objects = NULL; $term = trim($term); if ($term == ''){ $this->error = 'Empty query given!'; return false; } // match query: preg_match('/^SELECT (.*?) FROM (.*)$/i', $term, $m); // parse returns values // SELECT * FROM ... // SELECT foo,bar FROM ... $return_values = isset($m[1]) ? trim($m[1]) : '*'; if ($return_values != '*'){ $return_values = explode(',', strtolower($return_values)); $return_values = $this->_clean_array($return_values); } // match from and where part: // // ... FROM * WHERE $id=="one" // ... FROM a WHERE $class=="red" // ... FROM a // ... FROM * $last = isset($m[2]) ? trim($m[2]) : ''; $search_term = ''; $where_term = ''; if (preg_match('/^(.*?) WHERE (.*?)$/i', $last, $m)){ $search_term = trim($m[1]); $where_term = trim($m[2]); } else { $search_term = $last; } // find tags if ($search_term == '*'){ // search all $tag_names = array(); $tag_attributes = array(); $tag_values = array(); $html = $this->page; $this->_extract_all_tags($html, $tag_names, $tag_attributes, $tag_values); $this->_match_tags($results, $return_values, $where_term, $tag_attributes, $tag_values, $tag_names); } else { // search term is a tag $tagname = trim($search_term); $tag_attributes = array(); $tag_values = array(); $regexp = '<'.$tagname.'([ \t].*?|)>((.*?)<\/'.$tagname.'>)?'; preg_match_all('/'.$regexp.'/is', $this->page, $m); if (count($m[0]) != 0){ $tag_attributes = $m[1]; $tag_values = $m[3]; } $this->_match_tags($results, $return_values, $where_term, $tag_attributes, $tag_values, $tagname); } $this->results = $results; // was there a error during the search process? return ($this->error == ''); } /* ** convert_tagname_to_key ** ** converts the tagname to the array key */ function convert_tagname_to_key(){ $new_array = array(); $tag_name = ''; while(list($key,$val) = each($this->results)){ if (isset($val['tagname'])){ $tag_name = $val['tagname']; unset($val['tagname']); } else { $tag_name = '(empty)'; } $new_array[$tag_name] = $val; } $this->results = $new_array; } /* ** fetch_array ** ** returns the results as an array */ function fetch_array(){ return $this->results; } /* ** _array2object ** ** converts an array to an object */ function _array2object($array) { if (is_array($array)) { $obj = new StdClass(); foreach ($array as $key => $val){ $obj->$key = $val; } } else { $obj = $array; } return $obj; } /* ** fetch_objects ** ** returns the results as objects */ function fetch_objects(){ if ($this->results_objects == NULL){ $results = array(); reset($this->results); while(list($key,$val) = each($this->results)){ $results[$key] = $this->_array2object($val); } $this->results_objects = $results; } return $this->results_objects; } /* ** get_result_count ** ** returns the number of results */ function get_result_count(){ return count($this->results); } /* ** _add_result ** ** */ function _add_result(&$results, $return_values, $tag_attributes){ if ($return_values == '*'){ $results[] = $tag_attributes; } else if (is_array($return_values)){ $new_result = array(); reset($return_values); for ($t=0; $t < count($return_values); $t++){ $_tagname = explode(' as ', $return_values[$t]); $_caption = $return_values[$t]; if (count($_tagname) != 1){ $_caption = trim($_tagname[1]); $_tagname = trim($_tagname[0]); } else { $_tagname = $_caption; } $new_result[$_caption] = isset($tag_attributes[$_tagname]) ? $tag_attributes[$_tagname] : ''; } $results[] = $new_result; } } /* ** parse_attributes ** ** parses HTML attributes and returns an array */ function parse_attributes($attrib){ $attrib .= '>'; $mode = 'search_key'; $tmp = ''; $current_key = ''; $attributes = array(); for ($x=0; $x < strlen($attrib); $x++){ $char = $attrib[$x]; if ($char == '=' and $mode == 'search_key'){ $current_key = trim($tmp); $tmp = ''; $mode = 'value'; } else if ($mode == 'search_key' and preg_match('/[ \t\s\r\n>]/', $char)){ $current_key = strtolower(trim($tmp)); if ($current_key != ''){ $attributes[$current_key] = ''; } $tmp = ''; $current_key = ''; } else if ($mode == 'value' and $char == '"'){ $mode = 'find_value_ending_a'; } else if ($mode == 'value' and $char == '\''){ $mode = 'find_value_ending_b'; } else if ($mode == 'value'){ $tmp .= $char; $mode = 'find_value_ending_c'; } else if ( ($mode == 'find_value_ending_a' and $char == '"') or ($mode == 'find_value_ending_b' and $char == '\'') or ($mode == 'find_value_ending_c' and preg_match('/[ \t\s\r\n>]/', $char)) ){ $mode = 'search_key'; if ($current_key != ''){ $current_key = strtolower($current_key); $attributes[$current_key] = $tmp; } $tmp = ''; } else { $tmp .= $char; } } if ($mode != 'search_key' and $current_key != ''){ $current_key = strtolower($current_key); $attributes[$current_key] = trim(preg_replace('/>+$/', '', $tmp)); } return $attributes; } } ?>