From 2e9aaa9496d38b586d9495c0a39473328630d903 Mon Sep 17 00:00:00 2001 From: sgiehl Date: Sat, 31 Oct 2015 19:51:52 +0100 Subject: moved definitions of search engines to new repo and converted them to better readable yml format --- core/UrlHelper.php | 236 ----------------------------------------------------- 1 file changed, 236 deletions(-) (limited to 'core/UrlHelper.php') diff --git a/core/UrlHelper.php b/core/UrlHelper.php index 4a0ac0fa0a..66a0e64e25 100644 --- a/core/UrlHelper.php +++ b/core/UrlHelper.php @@ -258,242 +258,6 @@ class UrlHelper return $result; } - /** - * Extracts a keyword from a raw not encoded URL. - * Will only extract keyword if a known search engine has been detected. - * Returns the keyword: - * - in UTF8: automatically converted from other charsets when applicable - * - strtolowered: "QUErY test!" will return "query test!" - * - trimmed: extra spaces before and after are removed - * - * Lists of supported search engines can be found in /core/DataFiles/SearchEngines.php - * The function returns false when a keyword couldn't be found. - * eg. if the url is "http://www.google.com/partners.html" this will return false, - * as the google keyword parameter couldn't be found. - * - * @see unit tests in /tests/core/Common.test.php - * @param string $referrerUrl URL referrer URL, eg. $_SERVER['HTTP_REFERER'] - * @return array|bool false if a keyword couldn't be extracted, - * or array( - * 'name' => 'Google', - * 'keywords' => 'my searched keywords') - */ - public static function extractSearchEngineInformationFromUrl($referrerUrl) - { - $referrerParsed = @parse_url($referrerUrl); - $referrerHost = ''; - if (isset($referrerParsed['host'])) { - $referrerHost = $referrerParsed['host']; - } - if (empty($referrerHost)) { - return false; - } - // some search engines (eg. Bing Images) use the same domain - // as an existing search engine (eg. Bing), we must also use the url path - $referrerPath = ''; - if (isset($referrerParsed['path'])) { - $referrerPath = $referrerParsed['path']; - } - - // no search query - if (!isset($referrerParsed['query'])) { - $referrerParsed['query'] = ''; - } - $query = $referrerParsed['query']; - - // Google Referrers URLs sometimes have the fragment which contains the keyword - if (!empty($referrerParsed['fragment'])) { - $query .= '&' . $referrerParsed['fragment']; - } - - $searchEngines = Common::getSearchEngineUrls(); - - $hostPattern = self::getLossyUrl($referrerHost); - /* - * Try to get the best matching 'host' in definitions - * 1. check if host + path matches an definition - * 2. check if host only matches - * 3. check if host pattern + path matches - * 4. check if host pattern matches - * 5. special handling - */ - if (array_key_exists($referrerHost . $referrerPath, $searchEngines)) { - $referrerHost = $referrerHost . $referrerPath; - } elseif (array_key_exists($referrerHost, $searchEngines)) { - // no need to change host - } elseif (array_key_exists($hostPattern . $referrerPath, $searchEngines)) { - $referrerHost = $hostPattern . $referrerPath; - } elseif (array_key_exists($hostPattern, $searchEngines)) { - $referrerHost = $hostPattern; - } elseif (!array_key_exists($referrerHost, $searchEngines)) { - if (!strncmp($query, 'cx=partner-pub-', 15)) { - // Google custom search engine - $referrerHost = 'google.com/cse'; - } elseif (!strncmp($referrerPath, '/pemonitorhosted/ws/results/', 28)) { - // private-label search powered by InfoSpace Metasearch - $referrerHost = 'wsdsold.infospace.com'; - } elseif (strpos($referrerHost, '.images.search.yahoo.com') != false) { - // Yahoo! Images - $referrerHost = 'images.search.yahoo.com'; - } elseif (strpos($referrerHost, '.search.yahoo.com') != false) { - // Yahoo! - $referrerHost = 'search.yahoo.com'; - } else { - return false; - } - } - $searchEngineName = $searchEngines[$referrerHost][0]; - $variableNames = null; - if (isset($searchEngines[$referrerHost][1])) { - $variableNames = $searchEngines[$referrerHost][1]; - } - if (!$variableNames) { - $searchEngineNames = Common::getSearchEngineNames(); - $url = $searchEngineNames[$searchEngineName]; - $variableNames = $searchEngines[$url][1]; - } - if (!is_array($variableNames)) { - $variableNames = array($variableNames); - } - - $key = null; - if ($searchEngineName === 'Google Images' - || ($searchEngineName === 'Google' && strpos($referrerUrl, '/imgres') !== false) - ) { - if (strpos($query, '&prev') !== false) { - $query = urldecode(trim(self::getParameterFromQueryString($query, 'prev'))); - $query = str_replace('&', '&', strstr($query, '?')); - } - $searchEngineName = 'Google Images'; - } elseif ($searchEngineName === 'Google' - && (strpos($query, '&as_') !== false || strpos($query, 'as_') === 0) - ) { - $keys = array(); - $key = self::getParameterFromQueryString($query, 'as_q'); - if (!empty($key)) { - array_push($keys, $key); - } - $key = self::getParameterFromQueryString($query, 'as_oq'); - if (!empty($key)) { - array_push($keys, str_replace('+', ' OR ', $key)); - } - $key = self::getParameterFromQueryString($query, 'as_epq'); - if (!empty($key)) { - array_push($keys, "\"$key\""); - } - $key = self::getParameterFromQueryString($query, 'as_eq'); - if (!empty($key)) { - array_push($keys, "-$key"); - } - $key = trim(urldecode(implode(' ', $keys))); - } - - if ($searchEngineName === 'Google') { - // top bar menu - $tbm = self::getParameterFromQueryString($query, 'tbm'); - switch ($tbm) { - case 'isch': - $searchEngineName = 'Google Images'; - break; - case 'vid': - $searchEngineName = 'Google Video'; - break; - case 'shop': - $searchEngineName = 'Google Shopping'; - break; - } - } - - if (empty($key)) { - foreach ($variableNames as $variableName) { - if ($variableName[0] == '/') { - // regular expression match - if (preg_match($variableName, $referrerUrl, $matches)) { - $key = trim(urldecode($matches[1])); - break; - } - } else { - // search for keywords now &vname=keyword - $key = self::getParameterFromQueryString($query, $variableName); - $key = trim(urldecode($key)); - - // Special cases: empty or no keywords - if (empty($key) - && ( - // Google search with no keyword - ($searchEngineName == 'Google' - && (empty($query) && (empty($referrerPath) || $referrerPath == '/') && empty($referrerParsed['fragment'])) - ) - - // Yahoo search with no keyword - || ($searchEngineName == 'Yahoo!' - && ($referrerParsed['host'] == 'r.search.yahoo.com') - ) - - // empty keyword parameter - || strpos($query, sprintf('&%s=', $variableName)) !== false - || strpos($query, sprintf('?%s=', $variableName)) !== false - - // search engines with no keyword - || $searchEngineName == 'Ixquick' - || $searchEngineName == 'Google Images' - || $searchEngineName == 'DuckDuckGo') - ) { - $key = false; - } - if (!empty($key) - || $key === false - ) { - break; - } - } - } - } - - // $key === false is the special case "No keyword provided" which is a Search engine match - if ($key === null - || $key === '' - ) { - return false; - } - - if (!empty($key)) { - if (function_exists('iconv') - && isset($searchEngines[$referrerHost][3]) - ) { - // accepts string, array, or comma-separated list string in preferred order - $charsets = $searchEngines[$referrerHost][3]; - if (!is_array($charsets)) { - $charsets = explode(',', $charsets); - } - - if (!empty($charsets)) { - $charset = $charsets[0]; - if (count($charsets) > 1 - && function_exists('mb_detect_encoding') - ) { - $charset = mb_detect_encoding($key, $charsets); - if ($charset === false) { - $charset = $charsets[0]; - } - } - - $newkey = @iconv($charset, 'UTF-8//IGNORE', $key); - if (!empty($newkey)) { - $key = $newkey; - } - } - } - - $key = Common::mb_strtolower($key); - } - - return array( - 'name' => $searchEngineName, - 'keywords' => $key, - ); - } - /** * Returns the query part from any valid url and adds additional parameters to the query part if needed. * -- cgit v1.2.3