Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/matomo-org/matomo.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/core
diff options
context:
space:
mode:
author: Kate Butler <kate@innocraft.com> 2019-05-03 03:33:59 +0300
committer: Thomas Steur <tsteur@users.noreply.github.com> 2019-05-03 03:33:59 +0300
commit: b00014475cd0a53c1d7bb6452b04605b7fdc34d0 (patch)
tree: 91c7d08a6c774dbc7d06153950de2ff383cd5a13 /core
parent: 8f1f8ec8bd9a5a4cd651fdcb0a50081d58cd38ab (diff)
Task to purge archives for deleted websites and segments (#14317)
* Purge archives for deleted sites and segments
* Purge archives for deleted sites and segments
* Add new purgeOrphanedArchives task to expected list
* Fix build
* PR improvements
* Fix consistency of method names
* Fix typo
* Unit tests for getSegmentHashesByIdSite
* PR changes
* Add note on how to test the command
* Minor tweak to make sure no injections are possible
Diffstat (limited to 'core')
-rw-r--r-- core/Archive/ArchivePurger.php 73
-rw-r--r-- core/DataAccess/Model.php 69
-rw-r--r-- core/Segment.php 10
3 files changed, 149 insertions, 3 deletions
diff --git a/core/Archive/ArchivePurger.php b/core/Archive/ArchivePurger.php
index 078203cb8f..c97a893f83 100644
--- a/core/Archive/ArchivePurger.php
+++ b/core/Archive/ArchivePurger.php
@@ -9,11 +9,13 @@
namespace Piwik\Archive;
use Piwik\ArchiveProcessor\Rules;
+use Piwik\Common;
use Piwik\Config;
use Piwik\Container\StaticContainer;
use Piwik\DataAccess\ArchiveTableCreator;
use Piwik\DataAccess\Model;
use Piwik\Date;
+use Piwik\Db;
use Piwik\Piwik;
use Psr\Log\LoggerInterface;
use Psr\Log\LogLevel;
@@ -154,6 +156,77 @@ class ArchivePurger
return $deletedRowCount;
}
+ /**
+  * Purge the archives that belong to websites which no longer exist.
+  *
+  * @param Date $dateStart Date used to pick the archive table that is cleaned up
+  * @return int Number of rows deleted from the archive tables
+  */
+ public function purgeDeletedSiteArchives(Date $dateStart)
+ {
+     return $this->purge(
+         $this->getDeletedSiteArchiveIds($dateStart),
+         $dateStart,
+         'deleted sites'
+     );
+ }
+
+ /**
+  * Purge the archives that were built for segments which are no longer in the given whitelist.
+  *
+  * @param Date $dateStart Date used to pick the archive table that is cleaned up
+  * @param array $segmentHashesByIdSite List of valid segment hashes, indexed by site ID
+  * @return int Number of rows deleted from the archive tables
+  */
+ public function purgeDeletedSegmentArchives(Date $dateStart, array $segmentHashesByIdSite)
+ {
+     return $this->purge(
+         $this->getDeletedSegmentArchiveIds($dateStart, $segmentHashesByIdSite),
+         $dateStart,
+         'deleted segments'
+     );
+ }
+
+ /**
+  * Delete the numeric and blob archive rows for the given archive IDs and log the outcome.
+  *
+  * @param array $idArchivesToDelete IDs of the archives to remove
+  * @param Date $dateStart Date handed to the deletion and reported in the log messages
+  * @param string $reason Short description of why the archives are removed, used in the log messages
+  * @return int Number of deleted rows; 0 when there was nothing to delete
+  */
+ protected function purge(array $idArchivesToDelete, Date $dateStart, $reason)
+ {
+     // Nothing to remove: leave a debug trace and report zero deleted rows.
+     if (empty($idArchivesToDelete)) {
+         $this->logger->debug(
+             "No archives for {reason} found in archive numeric table for {date}.",
+             array('date' => $dateStart, 'reason' => $reason)
+         );
+
+         return 0;
+     }
+
+     $rowsRemoved = $this->deleteArchiveIds($dateStart, $idArchivesToDelete);
+
+     $summaryContext = array(
+         'count' => $rowsRemoved,
+         'date' => $dateStart,
+         'reason' => $reason
+     );
+     $this->logger->info(
+         "Deleted {count} rows in archive tables (numeric + blob) for {reason} for {date}.",
+         $summaryContext
+     );
+
+     // The full ID list is only interesting when debugging.
+     $joinedIds = implode(',', $idArchivesToDelete);
+     $this->logger->debug("[Deleted IDs: {deletedIds}]", array('deletedIds' => $joinedIds));
+
+     return $rowsRemoved;
+ }
+
+ /**
+  * Look up the IDs of archives whose site no longer exists, in the numeric archive table for the given date.
+  *
+  * @param Date $date Date used to pick the numeric archive table
+  * @return array Archive IDs as returned by the model
+  */
+ protected function getDeletedSiteArchiveIds(Date $date)
+ {
+     $numericTable = ArchiveTableCreator::getNumericTable($date);
+     // Archives newer than this threshold are left out of the lookup.
+     $keepThreshold = $this->getOldestTemporaryArchiveToKeepThreshold();
+
+     return $this->model->getArchiveIdsForDeletedSites($numericTable, $keepThreshold);
+ }
+
+ /**
+  * Look up the IDs of archives built for segments that are not in the given whitelist, in the numeric archive
+  * table for the given date.
+  *
+  * @param Date $date Date used to pick the numeric archive table
+  * @param array $segmentHashesByIdSite Valid segment hashes, indexed by site ID
+  * @return array Archive IDs as returned by the model
+  */
+ protected function getDeletedSegmentArchiveIds(Date $date, array $segmentHashesByIdSite)
+ {
+     $numericTable = ArchiveTableCreator::getNumericTable($date);
+     // Archives newer than this threshold are left out of the lookup.
+     $keepThreshold = $this->getOldestTemporaryArchiveToKeepThreshold();
+
+     return $this->model->getArchiveIdsForDeletedSegments($numericTable, $segmentHashesByIdSite, $keepThreshold);
+ }
+
protected function getOutdatedArchiveIds(Date $date, $purgeArchivesOlderThan)
{
$archiveTable = ArchiveTableCreator::getNumericTable($date);
diff --git a/core/DataAccess/Model.php b/core/DataAccess/Model.php
index d5da1e33ed..fe8c26d598 100644
--- a/core/DataAccess/Model.php
+++ b/core/DataAccess/Model.php
@@ -334,6 +334,75 @@ class Model
}
/**
+  * Find the archives in the given table whose idsite has no matching row in the site table. Only archives with
+  * ts_archived older than $oldestToKeep are returned, so temporary archives that may still be in use are left
+  * alone.
+  *
+  * @param string $archiveTableName
+  * @param string $oldestToKeep Datetime string
+  * @return array of IDs
+  */
+ public function getArchiveIdsForDeletedSites($archiveTableName, $oldestToKeep)
+ {
+     $siteTable = Common::prefixTable('site');
+
+     // LEFT JOIN + IS NULL keeps only the archive rows that have no site row.
+     $query = sprintf(
+         'SELECT DISTINCT idarchive FROM %s a  LEFT JOIN %s s USING (idsite)'
+         . ' WHERE s.idsite IS NULL AND ts_archived < ?',
+         $archiveTableName,
+         $siteTable
+     );
+
+     $orphanedRows = Db::fetchAll($query, array($oldestToKeep));
+
+     return array_column($orphanedRows, 'idarchive');
+ }
+
+ /**
+  * Get a list of IDs of archives with segments that no longer exist in the DB. Excludes temporary archives that
+  * may still be in use, as specified by the $oldestToKeep passed in.
+  *
+  * An archive is reported when its name starts with "done" (but is not exactly "done") and is not covered by the
+  * whitelist. Hashes listed under site ID 0 are accepted for every site, hashes listed under any other site ID are
+  * accepted for that site only. A site with an empty hash list adds nothing to the whitelist. When the whitelist is
+  * empty altogether, every such archive older than $oldestToKeep is reported.
+  *
+  * @param string $archiveTableName
+  * @param array $segmentHashesById Whitelist of existing segments, indexed by site ID
+  * @param string $oldestToKeep Datetime string
+  * @return array List of archive IDs (the idarchive values)
+  * @throws \Exception If a whitelisted value is not a hexadecimal string
+  */
+ public function getArchiveIdsForDeletedSegments($archiveTableName, array $segmentHashesById, $oldestToKeep)
+ {
+     $validSegmentClauses = [];
+
+     foreach ($segmentHashesById as $idSite => $segments) {
+         // segments are md5 hashes and such not a problem re sql injection. for performance etc we don't want to use
+         // bound parameters for the query, so anything that is not purely hexadecimal is rejected up front
+         foreach ($segments as $segment) {
+             if (!ctype_xdigit($segment)) {
+                 // Fully qualified so the global class is used regardless of the file's imports
+                 throw new \Exception($segment . ' expected to be an md5 hash');
+             }
+         }
+
+         // An empty list would otherwise produce "name LIKE 'done%'", which whitelists every archive of the site
+         if (empty($segments)) {
+             continue;
+         }
+
+         // Single-quoted SQL literals also work when the server runs with ANSI_QUOTES enabled
+         $nameMatchesSql = "name LIKE 'done" . implode("%' OR name LIKE 'done", $segments) . "%'";
+
+         // Special case as idsite=0 means the segments are not site-specific
+         if ($idSite === 0) {
+             $validSegmentClauses[] = '(' . $nameMatchesSql . ')';
+             continue;
+         }
+
+         $idSite = (int)$idSite;
+
+         // Vanilla case - segments that are valid for a single site only
+         $validSegmentClauses[] = '(idsite = ' . $idSite . ' AND (' . $nameMatchesSql . '))';
+     }
+
+     $sql = 'SELECT idarchive FROM ' . $archiveTableName
+         . " WHERE name LIKE 'done%' AND name != 'done'"
+         . ' AND ts_archived < ?';
+
+     // With no whitelist entries there is nothing to exclude; "AND NOT ()" would be invalid SQL
+     if (!empty($validSegmentClauses)) {
+         $sql .= ' AND NOT (' . implode(' OR ', $validSegmentClauses) . ')';
+     }
+
+     $rows = Db::fetchAll($sql, array($oldestToKeep));
+
+     return array_column($rows, 'idarchive');
+ }
+
+ /**
* Returns the SQL condition used to find successfully completed archives that
* this instance is querying for.
*/
diff --git a/core/Segment.php b/core/Segment.php
index 8fcef9c932..456172e5d0 100644
--- a/core/Segment.php
+++ b/core/Segment.php
@@ -307,9 +307,13 @@ class Segment
if (empty($this->string)) {
return '';
}
- // normalize the string as browsers may send slightly different payloads for the same archive
- $normalizedSegmentString = urldecode($this->string);
- return md5($normalizedSegmentString);
+ return self::getSegmentHash($this->string);
+ }
+
+ /**
+  * Compute the hash of a segment definition.
+  *
+  * @param string $definition Segment definition, possibly URL-encoded
+  * @return string md5 hash of the URL-decoded definition
+  */
+ public static function getSegmentHash($definition)
+ {
+     // urldecode to normalize the string, as browsers may send slightly different payloads for the same archive
+     $normalizedDefinition = urldecode($definition);
+
+     return md5($normalizedDefinition);
+ }
/**