diff options
author | Kate Butler <kate@innocraft.com> | 2019-05-03 03:33:59 +0300 |
---|---|---|
committer | Thomas Steur <tsteur@users.noreply.github.com> | 2019-05-03 03:33:59 +0300 |
commit | b00014475cd0a53c1d7bb6452b04605b7fdc34d0 (patch) | |
tree | 91c7d08a6c774dbc7d06153950de2ff383cd5a13 /core | |
parent | 8f1f8ec8bd9a5a4cd651fdcb0a50081d58cd38ab (diff) |
Task to purge archives for deleted websites and segments (#14317)
* Purge archives for deleted sites and segments
* Purge archives for deleted sites and segments
* Add new purgeOrphanedArchives task to expected list
* Fix build
* PR improvements
* Fix consistency of method names
* Fix typo
* Unit tests for getSegmentHashesByIdSite
* PR changes
* add note on how to test the command
* minor tweak to make sure no injections are possible
Diffstat (limited to 'core')
-rw-r--r-- | core/Archive/ArchivePurger.php | 73 | ||||
-rw-r--r-- | core/DataAccess/Model.php | 69 | ||||
-rw-r--r-- | core/Segment.php | 10 |
3 files changed, 149 insertions, 3 deletions
diff --git a/core/Archive/ArchivePurger.php b/core/Archive/ArchivePurger.php index 078203cb8f..c97a893f83 100644 --- a/core/Archive/ArchivePurger.php +++ b/core/Archive/ArchivePurger.php @@ -9,11 +9,13 @@ namespace Piwik\Archive; use Piwik\ArchiveProcessor\Rules; +use Piwik\Common; use Piwik\Config; use Piwik\Container\StaticContainer; use Piwik\DataAccess\ArchiveTableCreator; use Piwik\DataAccess\Model; use Piwik\Date; +use Piwik\Db; use Piwik\Piwik; use Psr\Log\LoggerInterface; use Psr\Log\LogLevel; @@ -154,6 +156,77 @@ class ArchivePurger return $deletedRowCount; } + public function purgeDeletedSiteArchives(Date $dateStart) + { + $idArchivesToDelete = $this->getDeletedSiteArchiveIds($dateStart); + + return $this->purge($idArchivesToDelete, $dateStart, 'deleted sites'); + } + + /** + * @param Date $dateStart + * @param array $segmentHashesByIdSite List of valid segment hashes, indexed by site ID + * @return int + */ + public function purgeDeletedSegmentArchives(Date $dateStart, array $segmentHashesByIdSite) + { + $idArchivesToDelete = $this->getDeletedSegmentArchiveIds($dateStart, $segmentHashesByIdSite); + + return $this->purge($idArchivesToDelete, $dateStart, 'deleted segments'); + } + + /** + * Purge all numeric and blob archives with the given IDs from the database. + * @param array $idArchivesToDelete + * @param Date $dateStart + * @param string $reason + * @return int + */ + protected function purge(array $idArchivesToDelete, Date $dateStart, $reason) + { + $deletedRowCount = 0; + if (!empty($idArchivesToDelete)) { + $deletedRowCount = $this->deleteArchiveIds($dateStart, $idArchivesToDelete); + + $this->logger->info( + "Deleted {count} rows in archive tables (numeric + blob) for {reason} for {date}.", + array( + 'count' => $deletedRowCount, + 'date' => $dateStart, + 'reason' => $reason + ) + ); + + $this->logger->debug("[Deleted IDs: {deletedIds}]", array( + 'deletedIds' => implode(',', $idArchivesToDelete) + )); + } else { + $this->logger->debug( + "No archives for {reason} found in archive numeric table for {date}.", + array('date' => $dateStart, 'reason' => $reason) + ); + } + + return $deletedRowCount; + } + + protected function getDeletedSiteArchiveIds(Date $date) + { + $archiveTable = ArchiveTableCreator::getNumericTable($date); + return $this->model->getArchiveIdsForDeletedSites( + $archiveTable, + $this->getOldestTemporaryArchiveToKeepThreshold() + ); + } + + protected function getDeletedSegmentArchiveIds(Date $date, array $segmentHashesByIdSite) + { + $archiveTable = ArchiveTableCreator::getNumericTable($date); + return $this->model->getArchiveIdsForDeletedSegments( + $archiveTable, $segmentHashesByIdSite, $this->getOldestTemporaryArchiveToKeepThreshold() + ); + } + protected function getOutdatedArchiveIds(Date $date, $purgeArchivesOlderThan) { $archiveTable = ArchiveTableCreator::getNumericTable($date); diff --git a/core/DataAccess/Model.php b/core/DataAccess/Model.php index d5da1e33ed..fe8c26d598 100644 --- a/core/DataAccess/Model.php +++ b/core/DataAccess/Model.php @@ -334,6 +334,75 @@ class Model } /** + * Get a list of IDs of archives that don't have any matching rows in the site table. Excludes temporary archives + * that may still be in use, as specified by the $oldestToKeep passed in. + * @param string $archiveTableName + * @param string $oldestToKeep Datetime string + * @return array of IDs + */ + public function getArchiveIdsForDeletedSites($archiveTableName, $oldestToKeep) + { + $sql = "SELECT DISTINCT idarchive FROM " . $archiveTableName . " a " + . " LEFT JOIN " . Common::prefixTable('site') . " s USING (idsite)" + . " WHERE s.idsite IS NULL" + . " AND ts_archived < ?"; + + $rows = Db::fetchAll($sql, array($oldestToKeep)); + + return array_column($rows, 'idarchive'); + } + + /** + * Get a list of IDs of archives with segments that no longer exist in the DB. Excludes temporary archives that + * may still be in use, as specified by the $oldestToKeep passed in. + * @param string $archiveTableName + * @param array $segmentHashesById Whitelist of existing segments, indexed by site ID + * @param string $oldestToKeep Datetime string + * @return array With keys idarchive, name, idsite + */ + public function getArchiveIdsForDeletedSegments($archiveTableName, array $segmentHashesById, $oldestToKeep) + { + $validSegmentClauses = []; + + foreach ($segmentHashesById as $idSite => $segments) { + // segments are md5 hashes and such not a problem re sql injection. for performance etc we don't want to use + // bound parameters for the query + foreach ($segments as $segment) { + if (!ctype_xdigit($segment)) { + throw new Exception($segment . ' expected to be an md5 hash'); + } + } + + // Special case as idsite=0 means the segments are not site-specific + if ($idSite === 0) { + foreach ($segments as $segmentHash) { + $validSegmentClauses[] = '(name LIKE "done' . $segmentHash . '%")'; + } + continue; + } + + $idSite = (int)$idSite; + + // Vanilla case - segments that are valid for a single site only + $sql = '(idsite = ' . $idSite . ' AND ('; + $sql .= 'name LIKE "done' . implode('%" OR name LIKE "done', $segments) . '%"'; + $sql .= '))'; + $validSegmentClauses[] = $sql; + } + + $isValidSegmentSql = implode(' OR ', $validSegmentClauses); + + $sql = 'SELECT idarchive FROM ' . $archiveTableName + . ' WHERE name LIKE "done%" AND name != "done"' + . ' AND ts_archived < ?' + . ' AND NOT (' . $isValidSegmentSql . ')'; + + $rows = Db::fetchAll($sql, array($oldestToKeep)); + + return array_map(function($row) { return $row['idarchive']; }, $rows); + } + + /** * Returns the SQL condition used to find successfully completed archives that * this instance is querying for. */ diff --git a/core/Segment.php b/core/Segment.php index 8fcef9c932..456172e5d0 100644 --- a/core/Segment.php +++ b/core/Segment.php @@ -307,9 +307,13 @@ class Segment if (empty($this->string)) { return ''; } - // normalize the string as browsers may send slightly different payloads for the same archive - $normalizedSegmentString = urldecode($this->string); - return md5($normalizedSegmentString); + return self::getSegmentHash($this->string); + } + + public static function getSegmentHash($definition) + { + // urldecode to normalize the string, as browsers may send slightly different payloads for the same archive + return md5(urldecode($definition)); } /** |