diff options
author | diosmosis <diosmosis@users.noreply.github.com> | 2020-01-08 11:05:41 +0300 |
---|---|---|
committer | Matthieu Aubry <mattab@users.noreply.github.com> | 2020-01-08 11:05:41 +0300 |
commit | 3c2b5f714f85457796e3e190781d3e592822ebf7 (patch) | |
tree | 5e7049fd0bac597481bfe70f972602af5c0220b9 | |
parent | 18b0815a78d158b45afbd752c7e21581321acb4c (diff) |
Do not proceed with archiving if a valid archive exists for the parameter combination. (#14937)
* Do not proceed with archiving if a valid archive exists for the parameter combination.
* test manually and get to work
* Apply some review feedback.
* apply more review feedback
* Do not abort archiving if archive found for non-day period so segments will still archive.
* Apply more review feedback.
* Make new optimization aware of whole periods.
* update submodule
* In CronArchive modify lastN date parameter to use oldest invalidated archive.
* Move date changing logic to method and unit test.
* Fix test.
-rw-r--r-- | core/ArchiveProcessor/Rules.php | 8 | ||||
-rw-r--r-- | core/Archiver/Request.php | 23 | ||||
-rw-r--r-- | core/CronArchive.php | 214 | ||||
-rw-r--r-- | core/DataAccess/ArchiveSelector.php | 14 | ||||
-rw-r--r-- | tests/PHPUnit/Integration/CronArchiveTest.php | 14 | ||||
-rw-r--r-- | tests/PHPUnit/Unit/Archiver/RequestTest.php | 47 |
6 files changed, 252 insertions, 68 deletions
diff --git a/core/ArchiveProcessor/Rules.php b/core/ArchiveProcessor/Rules.php index f054ce5d74..bf0f73b836 100644 --- a/core/ArchiveProcessor/Rules.php +++ b/core/ArchiveProcessor/Rules.php @@ -291,13 +291,15 @@ class Rules /** * Returns done flag values allowed to be selected * - * @return string + * @return string[] */ - public static function getSelectableDoneFlagValues() + public static function getSelectableDoneFlagValues($includeInvalidated = true) { $possibleValues = array(ArchiveWriter::DONE_OK, ArchiveWriter::DONE_OK_TEMPORARY); - if (!Rules::isRequestAuthorizedToArchive()) { + if (!Rules::isRequestAuthorizedToArchive() + && $includeInvalidated + ) { //If request is not authorized to archive then fetch also invalidated archives $possibleValues[] = ArchiveWriter::DONE_INVALIDATED; } diff --git a/core/Archiver/Request.php b/core/Archiver/Request.php index 4658366a2c..e08b23ac54 100644 --- a/core/Archiver/Request.php +++ b/core/Archiver/Request.php @@ -50,4 +50,27 @@ class Request { return $this->url; } + + /** + * @return string + */ + public function getUrl() + { + return $this->url; + } + + /** + * @param string $url + */ + public function setUrl($url) + { + $this->url = $url; + } + + public function changeDate($newDate) + { + $url = $this->getUrl(); + $url = preg_replace('/([&?])date=[^&]*/', '$1date=' . $newDate, $url); + $this->setUrl($url); + } } diff --git a/core/CronArchive.php b/core/CronArchive.php index 83a5911c4b..8b91161f6e 100644 --- a/core/CronArchive.php +++ b/core/CronArchive.php @@ -9,6 +9,7 @@ namespace Piwik; use Exception; +use Piwik\ArchiveProcessor\Parameters; use Piwik\ArchiveProcessor\PluginsArchiver; use Piwik\ArchiveProcessor\Rules; use Piwik\Archiver\Request; @@ -18,12 +19,15 @@ use Piwik\CronArchive\FixedSiteIds; use Piwik\CronArchive\Performance\Logger; use Piwik\CronArchive\SharedSiteIds; use Piwik\Archive\ArchiveInvalidator; +use Piwik\DataAccess\ArchiveSelector; use Piwik\DataAccess\RawLogDao; use Piwik\Exception\UnexpectedWebsiteFoundException; use Piwik\Metrics\Formatter; +use Piwik\Period\Factory; use Piwik\Period\Factory as PeriodFactory; use Piwik\CronArchive\SitesToReprocessDistributedList; use Piwik\CronArchive\SegmentArchivingRequestUrlProvider; +use Piwik\Period\Range; use Piwik\Plugins\CoreAdminHome\API as CoreAdminHomeAPI; use Piwik\Plugins\SegmentEditor\Model as SegmentEditorModel; use Piwik\Plugins\SitesManager\API as APISitesManager; @@ -478,7 +482,7 @@ class CronArchive try { if ($this->isWebsiteUsingTheTracker($idSite)) { - if(!$this->hadWebsiteTrafficSinceMidnightInTimezone($idSite)) { + if (!$this->hadWebsiteTrafficSinceMidnightInTimezone($idSite)) { $this->logger->info("Skipped website id $idSite as archiving is not needed"); $this->skippedDayNoRecentData++; @@ -831,7 +835,6 @@ class CronArchive $request = "?module=API&method=API.get&idSite=$idSite&period=$period&date=" . $date . "&format=php"; if ($segment) { $request .= '&segment=' . urlencode($segment); - ; } return $request; } @@ -886,8 +889,6 @@ class CronArchive $date = $this->getApiDateParameter($idSite, "day", $processDaysSince); $url = $this->getVisitsRequestUrl($idSite, "day", $date); - $this->logArchiveWebsite($idSite, "day", $date); - $cliMulti = $this->makeCliMulti(); if ($cliMulti->isCommandAlreadyRunning($this->makeRequestUrl($url))) { $this->logger->info("Skipped website id $idSite, such a process is already in progress, " . $timerWebsite->__toString()); @@ -895,53 +896,69 @@ class CronArchive return false; } - $content = $this->request($url); - $daysResponse = Common::safe_unserialize($content); + $visitsLastDays = 0; - if (empty($content) - || !is_array($daysResponse) - || count($daysResponse) == 0 - ) { - // cancel marking the site as reprocessed - if ($websiteInvalidatedShouldReprocess) { - $store = new SitesToReprocessDistributedList(); - $store->add($idSite); - } + list($isThereArchive, $newDate) = $this->isThereAValidArchiveForPeriod($idSite, 'day', $date, $segment = ''); + if ($isThereArchive) { + $visitsToday = Archive::build($idSite, 'day', $date)->getNumeric('nb_visits'); + $visitsToday = end($visitsToday); + $visitsToday = isset($visitsToday['nb_visits']) ? $visitsToday['nb_visits'] : 0; - $this->logError("Empty or invalid response '$content' for website id $idSite, " . $timerWebsite->__toString() . ", skipping"); - $this->skippedDayOnApiError++; - $this->skipped++; - return false; - } + $this->logArchiveWebsiteSkippedValidArchiveExists($idSite, 'day', $date); + ++$this->skipped; + } else { + $date = $newDate; // use modified lastN param - $visitsToday = $this->getVisitsLastPeriodFromApiResponse($daysResponse); - $visitsLastDays = $this->getVisitsFromApiResponse($daysResponse); + $this->logArchiveWebsite($idSite, "day", $date); - $this->requests++; - $this->processed++; + $content = $this->request($url); + $daysResponse = Common::safe_unserialize($content); - $shouldArchiveWithoutVisits = PluginsArchiver::doesAnyPluginArchiveWithoutVisits(); + if (empty($content) + || !is_array($daysResponse) + || count($daysResponse) == 0 + ) { + // cancel marking the site as reprocessed + if ($websiteInvalidatedShouldReprocess) { + $store = new SitesToReprocessDistributedList(); + $store->add($idSite); + } - // If there is no visit today and we don't need to process this website, we can skip remaining archives - if ( - 0 == $visitsToday && !$shouldArchiveWithoutVisits - && !$shouldArchivePeriods - ) { - $this->logger->info("Skipped website id $idSite, no visit today, " . $timerWebsite->__toString()); - $this->skippedDayNoRecentData++; - $this->skipped++; - return false; - } + $this->logError("Empty or invalid response '$content' for website id $idSite, " . $timerWebsite->__toString() . ", skipping"); + $this->skippedDayOnApiError++; + $this->skipped++; + return false; + } - if (0 == $visitsLastDays && !$shouldArchiveWithoutVisits - && !$shouldArchivePeriods - && $this->shouldArchiveAllSites - ) { - $humanReadableDate = $this->formatReadableDateRange($date); - $this->logger->info("Skipped website id $idSite, no visits in the $humanReadableDate days, " . $timerWebsite->__toString()); - $this->skippedPeriodsNoDataInPeriod++; - $this->skipped++; - return false; + $visitsToday = $this->getVisitsLastPeriodFromApiResponse($daysResponse); + $visitsLastDays = $this->getVisitsFromApiResponse($daysResponse); + + $this->requests++; + $this->processed++; + + $shouldArchiveWithoutVisits = PluginsArchiver::doesAnyPluginArchiveWithoutVisits(); + + // If there is no visit today and we don't need to process this website, we can skip remaining archives + if ( + 0 == $visitsToday && !$shouldArchiveWithoutVisits + && !$shouldArchivePeriods + ) { + $this->logger->info("Skipped website id $idSite, no visit today, " . $timerWebsite->__toString()); + $this->skippedDayNoRecentData++; + $this->skipped++; + return false; + } + + if (0 == $visitsLastDays && !$shouldArchiveWithoutVisits + && !$shouldArchivePeriods + && $this->shouldArchiveAllSites + ) { + $humanReadableDate = $this->formatReadableDateRange($date); + $this->logger->info("Skipped website id $idSite, no visits in the $humanReadableDate days, " . $timerWebsite->__toString()); + $this->skippedPeriodsNoDataInPeriod++; + $this->skipped++; + return false; + } } $this->visitsToday += $visitsToday; @@ -955,6 +972,67 @@ class CronArchive return $dayArchiveWasSuccessful; } + private function isThereAValidArchiveForPeriod($idSite, $period, $date, $segment = '') + { + if (Range::isMultiplePeriod($date, $period)) { + $rangePeriod = Factory::build($period, $date, Site::getTimezoneFor($idSite)); + $periodsToCheck = $rangePeriod->getSubperiods(); + } else { + $periodsToCheck = [Factory::build($period, $date, Site::getTimezoneFor($idSite))]; + } + + $periodsToCheckRanges = array_map(function (Period $p) { return $p->getRangeString(); }, $periodsToCheck); + + $this->invalidateArchivedReportsForSitesThatNeedToBeArchivedAgain(); + + $archiveIds = ArchiveSelector::getArchiveIds( + [$idSite], $periodsToCheck, new Segment($segment, [$idSite]), $plugins = [], // empty plugins param since we only check for an 'all' archive + $includeInvalidated = false + ); + + $foundArchivePeriods = []; + foreach ($archiveIds as $doneFlag => $dates) { + foreach ($dates as $dateRange => $idArchives) { + $foundArchivePeriods[] = $dateRange; + } + } + + $diff = array_diff($periodsToCheckRanges, $foundArchivePeriods); + $isThereArchiveForAllPeriods = empty($diff); + + // if there is an invalidated archive within the range, find out the oldest one and how far it is from today, + // and change the lastN $date to be value so it is correctly re-processed. + $newDate = $date; + if (!$isThereArchiveForAllPeriods + && preg_match('/^last([0-9]+)/', $date, $matches) + ) { + $lastNValue = (int) $matches[1]; + + usort($diff, function ($lhs, $rhs) { + $lhsDate = explode(',', $lhs)[0]; + $rhsDate = explode(',', $rhs)[0]; + + if ($lhsDate == $rhsDate) { + return 1; + } else if (Date::factory($lhsDate)->isEarlier(Date::factory($rhsDate))) { + return -1; + } else { + return 1; + } + }); + + $oldestDateWithoutArchive = explode(',', reset($diff))[0]; + $todayInTimezone = Date::factoryInTimezone('today', Site::getTimezoneFor($idSite)); + + /** @var Range $newRangePeriod */ + $newRangePeriod = PeriodFactory::build($period, $oldestDateWithoutArchive . ',' . $todayInTimezone); + + $newDate = 'last' . min($lastNValue, $newRangePeriod->getNumberOfSubperiods()); + } + + return [$isThereArchiveForAllPeriods, $newDate]; + } + /** * @param $idSite * @return array @@ -1019,13 +1097,22 @@ class CronArchive $self = $this; $request = new Request($url); - $request->before(function () use ($self, $url, $idSite, $period, $date) { + $request->before(function () use ($self, $url, $idSite, $period, $date, $segment, $request) { if ($self->isAlreadyArchivingUrl($url, $idSite, $period, $date)) { - return Request::ABORT; + return Request::ABORT; + } + + list($isThereArchive, $newDate) = $this->isThereAValidArchiveForPeriod($idSite, $period, $date, $segment); + if ($isThereArchive) { + $this->logArchiveWebsiteSkippedValidArchiveExists($idSite, $period, $date); + return Request::ABORT; } + + $request->changeDate($newDate); + + $this->logArchiveWebsite($idSite, $period, $newDate); }); $urls[] = $request; - $this->logArchiveWebsite($idSite, $period, $date); } $segmentRequestsCount = 0; @@ -1070,6 +1157,18 @@ class CronArchive return $success; } + // TODO: need test to make sure segment archives are invalidated as well + private function logArchiveWebsiteSkippedValidArchiveExists($idSite, $period, $date, $segment = '') + { + $this->logger->info("Skipping archiving for website id = {idSite}, period = {period}, date = {date}, segment = {segment}, " + . "since there is already a valid archive (tracking a visit automatically invalidates archives).", [ + 'idSite' => $idSite, + 'period' => $period, + 'date' => $date, + 'segment' => $segment, + ]); + } + /** * Logs a section in the output * @@ -1294,7 +1393,7 @@ class CronArchive { $timezone = Site::getTimezoneFor($idSite); - $nowInTimezone = Date::factory('now', $timezone); + $nowInTimezone = Date::factoryInTimezone('now', $timezone); $midnightInTimezone = $nowInTimezone->setTime('00:00:00'); $secondsSinceMidnight = $nowInTimezone->getTimestamp() - $midnightInTimezone->getTimestamp(); @@ -1832,6 +1931,7 @@ class CronArchive $segments[] = $segment; } + $segmentCount = count($segments); $processedSegmentCount = 0; @@ -1865,18 +1965,28 @@ class CronArchive $request = new Request($urlWithSegment); $logger = $this->logger; $self = $this; - $request->before(function () use ($logger, $segment, $segmentCount, &$processedSegmentCount, $idSite, $period, $urlWithSegment, $self) { - + $request->before(function () use ($logger, $segment, $segmentCount, &$processedSegmentCount, $idSite, $period, $date, $urlWithSegment, $self, $request) { if ($self->isAlreadyArchivingSegment($urlWithSegment, $idSite, $period, $segment)) { return Request::ABORT; } + list($isThereArchive, $newDate) = $this->isThereAValidArchiveForPeriod($idSite, $period, $date, $segment); + if ($isThereArchive) { + $this->logArchiveWebsiteSkippedValidArchiveExists($idSite, $period, $date, $segment); + return Request::ABORT; + } + + $url = $request->getUrl(); + $url = preg_replace('/([&?])date=[^&]*/', '$1date=' . $newDate, $url); + $request->setUrl($url); + $processedSegmentCount++; $logger->info(sprintf( - '- pre-processing segment %d/%d %s', + '- pre-processing segment %d/%d %s [date = %s]', $processedSegmentCount, $segmentCount, - $segment + $segment, + $newDate )); }); diff --git a/core/DataAccess/ArchiveSelector.php b/core/DataAccess/ArchiveSelector.php index d8063c33c0..21d7a9f626 100644 --- a/core/DataAccess/ArchiveSelector.php +++ b/core/DataAccess/ArchiveSelector.php @@ -51,7 +51,7 @@ class ArchiveSelector * @return array|bool * @throws Exception */ - public static function getArchiveIdAndVisits(ArchiveProcessor\Parameters $params, $minDatetimeArchiveProcessedUTC = false) + public static function getArchiveIdAndVisits(ArchiveProcessor\Parameters $params, $minDatetimeArchiveProcessedUTC = false, $includeInvalidated = true) { $idSite = $params->getSite()->getId(); $period = $params->getPeriod()->getId(); @@ -71,7 +71,7 @@ class ArchiveSelector $plugins = array("VisitsSummary", $requestedPlugin); $doneFlags = Rules::getDoneFlags($plugins, $segment); - $doneFlagValues = Rules::getSelectableDoneFlagValues(); + $doneFlagValues = Rules::getSelectableDoneFlagValues($includeInvalidated); $results = self::getModel()->getArchiveIdAndVisits($numericTable, $idSite, $period, $dateStartIso, $dateEndIso, $minDatetimeIsoArchiveProcessedUTC, $doneFlags, $doneFlagValues); @@ -144,6 +144,7 @@ class ArchiveSelector * @param array $periods * @param Segment $segment * @param array $plugins List of plugin names for which data is being requested. + * @param bool $includeInvalidated true to include archives that are DONE_INVALIDATED, false if only DONE_OK. * @return array Archive IDs are grouped by archive name and period range, ie, * array( * 'VisitsSummary.done' => array( @@ -152,7 +153,7 @@ class ArchiveSelector * ) * @throws */ - public static function getArchiveIds($siteIds, $periods, $segment, $plugins) + public static function getArchiveIds($siteIds, $periods, $segment, $plugins, $includeInvalidated = true) { if (empty($siteIds)) { throw new \Exception("Website IDs could not be read from the request, ie. idSite="); @@ -165,7 +166,7 @@ class ArchiveSelector $getArchiveIdsSql = "SELECT idsite, name, date1, date2, MAX(idarchive) as idarchive FROM %s WHERE idsite IN (" . implode(',', $siteIds) . ") - AND " . self::getNameCondition($plugins, $segment) . " + AND " . self::getNameCondition($plugins, $segment, $includeInvalidated) . " AND %s GROUP BY idsite, date1, date2, name"; @@ -365,16 +366,17 @@ class ArchiveSelector * * @param array $plugins * @param Segment $segment + * @param bool $includeInvalidated * @return string */ - private static function getNameCondition(array $plugins, Segment $segment) + private static function getNameCondition(array $plugins, Segment $segment, $includeInvalidated = true) { // the flags used to tell how the archiving process for a specific archive was completed, // if it was completed $doneFlags = Rules::getDoneFlags($plugins, $segment); $allDoneFlags = "'" . implode("','", $doneFlags) . "'"; - $possibleValues = Rules::getSelectableDoneFlagValues(); + $possibleValues = Rules::getSelectableDoneFlagValues($includeInvalidated); // create the SQL to find archives that are DONE return "((name IN ($allDoneFlags)) AND (value IN (" . implode(',', $possibleValues) . ")))"; diff --git a/tests/PHPUnit/Integration/CronArchiveTest.php b/tests/PHPUnit/Integration/CronArchiveTest.php index 4f4f2f9588..74f0dc561b 100644 --- a/tests/PHPUnit/Integration/CronArchiveTest.php +++ b/tests/PHPUnit/Integration/CronArchiveTest.php @@ -151,22 +151,22 @@ Starting Matomo reports archiving... Will pre-process for website id = 1, period = day, date = last%s - pre-processing all visits - skipping segment archiving for 'actions>=4'. -- pre-processing segment 1/1 actions>=2 +- pre-processing segment 1/1 actions>=2 [date = last52] Archived website id = 1, period = day, 1 segments, 1 visits in last %s days, 1 visits today, Time elapsed: %s +- skipping segment archiving for 'actions>=4'. Will pre-process for website id = 1, period = week, date = last%s - pre-processing all visits -- skipping segment archiving for 'actions>=4'. -- pre-processing segment 1/1 actions>=2 +- pre-processing segment 1/1 actions>=2 [date = last260] Archived website id = 1, period = week, 1 segments, 1 visits in last %s weeks, 1 visits this week, Time elapsed: %s +- skipping segment archiving for 'actions>=4'. Will pre-process for website id = 1, period = month, date = last%s - pre-processing all visits -- skipping segment archiving for 'actions>=4'. -- pre-processing segment 1/1 actions>=2 +- pre-processing segment 1/1 actions>=2 [date = last52] Archived website id = 1, period = month, 1 segments, 1 visits in last %s months, 1 visits this month, Time elapsed: %s +- skipping segment archiving for 'actions>=4'. Will pre-process for website id = 1, period = year, date = last%s - pre-processing all visits -- skipping segment archiving for 'actions>=4'. -- pre-processing segment 1/1 actions>=2 +- pre-processing segment 1/1 actions>=2 [date = last7] Archived website id = 1, period = year, 1 segments, 1 visits in last %s years, 1 visits this year, Time elapsed: %s Archived website id = 1, %s API requests, Time elapsed: %s [1/1 done] Done archiving! diff --git a/tests/PHPUnit/Unit/Archiver/RequestTest.php b/tests/PHPUnit/Unit/Archiver/RequestTest.php new file mode 100644 index 0000000000..ae168023c2 --- /dev/null +++ b/tests/PHPUnit/Unit/Archiver/RequestTest.php @@ -0,0 +1,47 @@ +<?php +/** + * Piwik - free/libre analytics platform + * + * @link https://matomo.org + * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later + * + */ + +namespace Piwik\Tests\Unit\Archiver; + + +use Piwik\Archiver\Request; + +class RequestTest extends \PHPUnit_Framework_TestCase +{ + /** + * @dataProvider getTestDataForChangeDate + */ + public function test_changeDate_replacesDateProperly($url, $newDate, $expectedNewUrl) + { + $request = new Request($url); + $request->changeDate($newDate); + $this->assertEquals($expectedNewUrl, $request->getUrl()); + } + + public function getTestDataForChangeDate() + { + return [ + [ + 'http://abc.com/index.php?trigger=archivephp&method=API.get&date=2012-03-04', + 'last12', + 'http://abc.com/index.php?trigger=archivephp&method=API.get&date=last12', + ], + [ + 'http://abc.com/index.php?trigger=archivephp&method=API.get&date=2012-03-04,2013-02-4&period=day', + 'previous18', + 'http://abc.com/index.php?trigger=archivephp&method=API.get&date=previous18&period=day', + ], + [ + 'http://abc.com/index.php?date=lastN&period=day', + '2013-10-12,2013-11-19', + 'http://abc.com/index.php?date=2013-10-12,2013-11-19&period=day', + ], + ]; + } +}
\ No newline at end of file |