Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/matomo-org/matomo.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordiosmosis <diosmosis@users.noreply.github.com>2020-01-08 11:05:41 +0300
committerMatthieu Aubry <mattab@users.noreply.github.com>2020-01-08 11:05:41 +0300
commit3c2b5f714f85457796e3e190781d3e592822ebf7 (patch)
tree5e7049fd0bac597481bfe70f972602af5c0220b9
parent18b0815a78d158b45afbd752c7e21581321acb4c (diff)
Do not proceed with archiving if a valid archive exists for the parameter combination. (#14937)
* Do not proceed with archiving if a valid archive exists for the parameter combination. * test manually and get to work * Apply some review feedback. * apply more review feedback * Do not abort archiving if archive found for non-day period so segments will still archive. * Apply more review feedback. * Make new optimization aware of whole periods. * update submodule * In CronArchive modify lastN date parameter to use oldest invalidated archive. * Move date changing logic to method and unit test. * Fix test.
-rw-r--r--core/ArchiveProcessor/Rules.php8
-rw-r--r--core/Archiver/Request.php23
-rw-r--r--core/CronArchive.php214
-rw-r--r--core/DataAccess/ArchiveSelector.php14
-rw-r--r--tests/PHPUnit/Integration/CronArchiveTest.php14
-rw-r--r--tests/PHPUnit/Unit/Archiver/RequestTest.php47
6 files changed, 252 insertions, 68 deletions
diff --git a/core/ArchiveProcessor/Rules.php b/core/ArchiveProcessor/Rules.php
index f054ce5d74..bf0f73b836 100644
--- a/core/ArchiveProcessor/Rules.php
+++ b/core/ArchiveProcessor/Rules.php
@@ -291,13 +291,15 @@ class Rules
/**
* Returns done flag values allowed to be selected
*
- * @return string
+ * @return string[]
*/
- public static function getSelectableDoneFlagValues()
+ public static function getSelectableDoneFlagValues($includeInvalidated = true)
{
$possibleValues = array(ArchiveWriter::DONE_OK, ArchiveWriter::DONE_OK_TEMPORARY);
- if (!Rules::isRequestAuthorizedToArchive()) {
+ if (!Rules::isRequestAuthorizedToArchive()
+ && $includeInvalidated
+ ) {
//If request is not authorized to archive then fetch also invalidated archives
$possibleValues[] = ArchiveWriter::DONE_INVALIDATED;
}
diff --git a/core/Archiver/Request.php b/core/Archiver/Request.php
index 4658366a2c..e08b23ac54 100644
--- a/core/Archiver/Request.php
+++ b/core/Archiver/Request.php
@@ -50,4 +50,27 @@ class Request
{
return $this->url;
}
+
+ /**
+ * @return string
+ */
+ public function getUrl()
+ {
+ return $this->url;
+ }
+
+ /**
+ * @param string $url
+ */
+ public function setUrl($url)
+ {
+ $this->url = $url;
+ }
+
+ public function changeDate($newDate)
+ {
+ $url = $this->getUrl();
+ $url = preg_replace('/([&?])date=[^&]*/', '$1date=' . $newDate, $url);
+ $this->setUrl($url);
+ }
}
diff --git a/core/CronArchive.php b/core/CronArchive.php
index 83a5911c4b..8b91161f6e 100644
--- a/core/CronArchive.php
+++ b/core/CronArchive.php
@@ -9,6 +9,7 @@
namespace Piwik;
use Exception;
+use Piwik\ArchiveProcessor\Parameters;
use Piwik\ArchiveProcessor\PluginsArchiver;
use Piwik\ArchiveProcessor\Rules;
use Piwik\Archiver\Request;
@@ -18,12 +19,15 @@ use Piwik\CronArchive\FixedSiteIds;
use Piwik\CronArchive\Performance\Logger;
use Piwik\CronArchive\SharedSiteIds;
use Piwik\Archive\ArchiveInvalidator;
+use Piwik\DataAccess\ArchiveSelector;
use Piwik\DataAccess\RawLogDao;
use Piwik\Exception\UnexpectedWebsiteFoundException;
use Piwik\Metrics\Formatter;
+use Piwik\Period\Factory;
use Piwik\Period\Factory as PeriodFactory;
use Piwik\CronArchive\SitesToReprocessDistributedList;
use Piwik\CronArchive\SegmentArchivingRequestUrlProvider;
+use Piwik\Period\Range;
use Piwik\Plugins\CoreAdminHome\API as CoreAdminHomeAPI;
use Piwik\Plugins\SegmentEditor\Model as SegmentEditorModel;
use Piwik\Plugins\SitesManager\API as APISitesManager;
@@ -478,7 +482,7 @@ class CronArchive
try {
if ($this->isWebsiteUsingTheTracker($idSite)) {
- if(!$this->hadWebsiteTrafficSinceMidnightInTimezone($idSite)) {
+ if (!$this->hadWebsiteTrafficSinceMidnightInTimezone($idSite)) {
$this->logger->info("Skipped website id $idSite as archiving is not needed");
$this->skippedDayNoRecentData++;
@@ -831,7 +835,6 @@ class CronArchive
$request = "?module=API&method=API.get&idSite=$idSite&period=$period&date=" . $date . "&format=php";
if ($segment) {
$request .= '&segment=' . urlencode($segment);
- ;
}
return $request;
}
@@ -886,8 +889,6 @@ class CronArchive
$date = $this->getApiDateParameter($idSite, "day", $processDaysSince);
$url = $this->getVisitsRequestUrl($idSite, "day", $date);
- $this->logArchiveWebsite($idSite, "day", $date);
-
$cliMulti = $this->makeCliMulti();
if ($cliMulti->isCommandAlreadyRunning($this->makeRequestUrl($url))) {
$this->logger->info("Skipped website id $idSite, such a process is already in progress, " . $timerWebsite->__toString());
@@ -895,53 +896,69 @@ class CronArchive
return false;
}
- $content = $this->request($url);
- $daysResponse = Common::safe_unserialize($content);
+ $visitsLastDays = 0;
- if (empty($content)
- || !is_array($daysResponse)
- || count($daysResponse) == 0
- ) {
- // cancel marking the site as reprocessed
- if ($websiteInvalidatedShouldReprocess) {
- $store = new SitesToReprocessDistributedList();
- $store->add($idSite);
- }
+ list($isThereArchive, $newDate) = $this->isThereAValidArchiveForPeriod($idSite, 'day', $date, $segment = '');
+ if ($isThereArchive) {
+ $visitsToday = Archive::build($idSite, 'day', $date)->getNumeric('nb_visits');
+ $visitsToday = end($visitsToday);
+ $visitsToday = isset($visitsToday['nb_visits']) ? $visitsToday['nb_visits'] : 0;
- $this->logError("Empty or invalid response '$content' for website id $idSite, " . $timerWebsite->__toString() . ", skipping");
- $this->skippedDayOnApiError++;
- $this->skipped++;
- return false;
- }
+ $this->logArchiveWebsiteSkippedValidArchiveExists($idSite, 'day', $date);
+ ++$this->skipped;
+ } else {
+ $date = $newDate; // use modified lastN param
- $visitsToday = $this->getVisitsLastPeriodFromApiResponse($daysResponse);
- $visitsLastDays = $this->getVisitsFromApiResponse($daysResponse);
+ $this->logArchiveWebsite($idSite, "day", $date);
- $this->requests++;
- $this->processed++;
+ $content = $this->request($url);
+ $daysResponse = Common::safe_unserialize($content);
- $shouldArchiveWithoutVisits = PluginsArchiver::doesAnyPluginArchiveWithoutVisits();
+ if (empty($content)
+ || !is_array($daysResponse)
+ || count($daysResponse) == 0
+ ) {
+ // cancel marking the site as reprocessed
+ if ($websiteInvalidatedShouldReprocess) {
+ $store = new SitesToReprocessDistributedList();
+ $store->add($idSite);
+ }
- // If there is no visit today and we don't need to process this website, we can skip remaining archives
- if (
- 0 == $visitsToday && !$shouldArchiveWithoutVisits
- && !$shouldArchivePeriods
- ) {
- $this->logger->info("Skipped website id $idSite, no visit today, " . $timerWebsite->__toString());
- $this->skippedDayNoRecentData++;
- $this->skipped++;
- return false;
- }
+ $this->logError("Empty or invalid response '$content' for website id $idSite, " . $timerWebsite->__toString() . ", skipping");
+ $this->skippedDayOnApiError++;
+ $this->skipped++;
+ return false;
+ }
- if (0 == $visitsLastDays && !$shouldArchiveWithoutVisits
- && !$shouldArchivePeriods
- && $this->shouldArchiveAllSites
- ) {
- $humanReadableDate = $this->formatReadableDateRange($date);
- $this->logger->info("Skipped website id $idSite, no visits in the $humanReadableDate days, " . $timerWebsite->__toString());
- $this->skippedPeriodsNoDataInPeriod++;
- $this->skipped++;
- return false;
+ $visitsToday = $this->getVisitsLastPeriodFromApiResponse($daysResponse);
+ $visitsLastDays = $this->getVisitsFromApiResponse($daysResponse);
+
+ $this->requests++;
+ $this->processed++;
+
+ $shouldArchiveWithoutVisits = PluginsArchiver::doesAnyPluginArchiveWithoutVisits();
+
+ // If there is no visit today and we don't need to process this website, we can skip remaining archives
+ if (
+ 0 == $visitsToday && !$shouldArchiveWithoutVisits
+ && !$shouldArchivePeriods
+ ) {
+ $this->logger->info("Skipped website id $idSite, no visit today, " . $timerWebsite->__toString());
+ $this->skippedDayNoRecentData++;
+ $this->skipped++;
+ return false;
+ }
+
+ if (0 == $visitsLastDays && !$shouldArchiveWithoutVisits
+ && !$shouldArchivePeriods
+ && $this->shouldArchiveAllSites
+ ) {
+ $humanReadableDate = $this->formatReadableDateRange($date);
+ $this->logger->info("Skipped website id $idSite, no visits in the $humanReadableDate days, " . $timerWebsite->__toString());
+ $this->skippedPeriodsNoDataInPeriod++;
+ $this->skipped++;
+ return false;
+ }
}
$this->visitsToday += $visitsToday;
@@ -955,6 +972,67 @@ class CronArchive
return $dayArchiveWasSuccessful;
}
+ private function isThereAValidArchiveForPeriod($idSite, $period, $date, $segment = '')
+ {
+ if (Range::isMultiplePeriod($date, $period)) {
+ $rangePeriod = Factory::build($period, $date, Site::getTimezoneFor($idSite));
+ $periodsToCheck = $rangePeriod->getSubperiods();
+ } else {
+ $periodsToCheck = [Factory::build($period, $date, Site::getTimezoneFor($idSite))];
+ }
+
+ $periodsToCheckRanges = array_map(function (Period $p) { return $p->getRangeString(); }, $periodsToCheck);
+
+ $this->invalidateArchivedReportsForSitesThatNeedToBeArchivedAgain();
+
+ $archiveIds = ArchiveSelector::getArchiveIds(
+ [$idSite], $periodsToCheck, new Segment($segment, [$idSite]), $plugins = [], // empty plugins param since we only check for an 'all' archive
+ $includeInvalidated = false
+ );
+
+ $foundArchivePeriods = [];
+ foreach ($archiveIds as $doneFlag => $dates) {
+ foreach ($dates as $dateRange => $idArchives) {
+ $foundArchivePeriods[] = $dateRange;
+ }
+ }
+
+ $diff = array_diff($periodsToCheckRanges, $foundArchivePeriods);
+ $isThereArchiveForAllPeriods = empty($diff);
+
+ // if there is an invalidated archive within the range, find out the oldest one and how far it is from today,
+ // and change the lastN $date to be value so it is correctly re-processed.
+ $newDate = $date;
+ if (!$isThereArchiveForAllPeriods
+ && preg_match('/^last([0-9]+)/', $date, $matches)
+ ) {
+ $lastNValue = (int) $matches[1];
+
+ usort($diff, function ($lhs, $rhs) {
+ $lhsDate = explode(',', $lhs)[0];
+ $rhsDate = explode(',', $rhs)[0];
+
+ if ($lhsDate == $rhsDate) {
+ return 1;
+ } else if (Date::factory($lhsDate)->isEarlier(Date::factory($rhsDate))) {
+ return -1;
+ } else {
+ return 1;
+ }
+ });
+
+ $oldestDateWithoutArchive = explode(',', reset($diff))[0];
+ $todayInTimezone = Date::factoryInTimezone('today', Site::getTimezoneFor($idSite));
+
+ /** @var Range $newRangePeriod */
+ $newRangePeriod = PeriodFactory::build($period, $oldestDateWithoutArchive . ',' . $todayInTimezone);
+
+ $newDate = 'last' . min($lastNValue, $newRangePeriod->getNumberOfSubperiods());
+ }
+
+ return [$isThereArchiveForAllPeriods, $newDate];
+ }
+
/**
* @param $idSite
* @return array
@@ -1019,13 +1097,22 @@ class CronArchive
$self = $this;
$request = new Request($url);
- $request->before(function () use ($self, $url, $idSite, $period, $date) {
+ $request->before(function () use ($self, $url, $idSite, $period, $date, $segment, $request) {
if ($self->isAlreadyArchivingUrl($url, $idSite, $period, $date)) {
- return Request::ABORT;
+ return Request::ABORT;
+ }
+
+ list($isThereArchive, $newDate) = $this->isThereAValidArchiveForPeriod($idSite, $period, $date, $segment);
+ if ($isThereArchive) {
+ $this->logArchiveWebsiteSkippedValidArchiveExists($idSite, $period, $date);
+ return Request::ABORT;
}
+
+ $request->changeDate($newDate);
+
+ $this->logArchiveWebsite($idSite, $period, $newDate);
});
$urls[] = $request;
- $this->logArchiveWebsite($idSite, $period, $date);
}
$segmentRequestsCount = 0;
@@ -1070,6 +1157,18 @@ class CronArchive
return $success;
}
+ // TODO: need test to make sure segment archives are invalidated as well
+ private function logArchiveWebsiteSkippedValidArchiveExists($idSite, $period, $date, $segment = '')
+ {
+ $this->logger->info("Skipping archiving for website id = {idSite}, period = {period}, date = {date}, segment = {segment}, "
+ . "since there is already a valid archive (tracking a visit automatically invalidates archives).", [
+ 'idSite' => $idSite,
+ 'period' => $period,
+ 'date' => $date,
+ 'segment' => $segment,
+ ]);
+ }
+
/**
* Logs a section in the output
*
@@ -1294,7 +1393,7 @@ class CronArchive
{
$timezone = Site::getTimezoneFor($idSite);
- $nowInTimezone = Date::factory('now', $timezone);
+ $nowInTimezone = Date::factoryInTimezone('now', $timezone);
$midnightInTimezone = $nowInTimezone->setTime('00:00:00');
$secondsSinceMidnight = $nowInTimezone->getTimestamp() - $midnightInTimezone->getTimestamp();
@@ -1832,6 +1931,7 @@ class CronArchive
$segments[] = $segment;
}
+
$segmentCount = count($segments);
$processedSegmentCount = 0;
@@ -1865,18 +1965,28 @@ class CronArchive
$request = new Request($urlWithSegment);
$logger = $this->logger;
$self = $this;
- $request->before(function () use ($logger, $segment, $segmentCount, &$processedSegmentCount, $idSite, $period, $urlWithSegment, $self) {
-
+ $request->before(function () use ($logger, $segment, $segmentCount, &$processedSegmentCount, $idSite, $period, $date, $urlWithSegment, $self, $request) {
if ($self->isAlreadyArchivingSegment($urlWithSegment, $idSite, $period, $segment)) {
return Request::ABORT;
}
+ list($isThereArchive, $newDate) = $this->isThereAValidArchiveForPeriod($idSite, $period, $date, $segment);
+ if ($isThereArchive) {
+ $this->logArchiveWebsiteSkippedValidArchiveExists($idSite, $period, $date, $segment);
+ return Request::ABORT;
+ }
+
+ $url = $request->getUrl();
+ $url = preg_replace('/([&?])date=[^&]*/', '$1date=' . $newDate, $url);
+ $request->setUrl($url);
+
$processedSegmentCount++;
$logger->info(sprintf(
- '- pre-processing segment %d/%d %s',
+ '- pre-processing segment %d/%d %s [date = %s]',
$processedSegmentCount,
$segmentCount,
- $segment
+ $segment,
+ $newDate
));
});
diff --git a/core/DataAccess/ArchiveSelector.php b/core/DataAccess/ArchiveSelector.php
index d8063c33c0..21d7a9f626 100644
--- a/core/DataAccess/ArchiveSelector.php
+++ b/core/DataAccess/ArchiveSelector.php
@@ -51,7 +51,7 @@ class ArchiveSelector
* @return array|bool
* @throws Exception
*/
- public static function getArchiveIdAndVisits(ArchiveProcessor\Parameters $params, $minDatetimeArchiveProcessedUTC = false)
+ public static function getArchiveIdAndVisits(ArchiveProcessor\Parameters $params, $minDatetimeArchiveProcessedUTC = false, $includeInvalidated = true)
{
$idSite = $params->getSite()->getId();
$period = $params->getPeriod()->getId();
@@ -71,7 +71,7 @@ class ArchiveSelector
$plugins = array("VisitsSummary", $requestedPlugin);
$doneFlags = Rules::getDoneFlags($plugins, $segment);
- $doneFlagValues = Rules::getSelectableDoneFlagValues();
+ $doneFlagValues = Rules::getSelectableDoneFlagValues($includeInvalidated);
$results = self::getModel()->getArchiveIdAndVisits($numericTable, $idSite, $period, $dateStartIso, $dateEndIso, $minDatetimeIsoArchiveProcessedUTC, $doneFlags, $doneFlagValues);
@@ -144,6 +144,7 @@ class ArchiveSelector
* @param array $periods
* @param Segment $segment
* @param array $plugins List of plugin names for which data is being requested.
+ * @param bool $includeInvalidated true to include archives that are DONE_INVALIDATED, false if only DONE_OK.
* @return array Archive IDs are grouped by archive name and period range, ie,
* array(
* 'VisitsSummary.done' => array(
@@ -152,7 +153,7 @@ class ArchiveSelector
* )
* @throws
*/
- public static function getArchiveIds($siteIds, $periods, $segment, $plugins)
+ public static function getArchiveIds($siteIds, $periods, $segment, $plugins, $includeInvalidated = true)
{
if (empty($siteIds)) {
throw new \Exception("Website IDs could not be read from the request, ie. idSite=");
@@ -165,7 +166,7 @@ class ArchiveSelector
$getArchiveIdsSql = "SELECT idsite, name, date1, date2, MAX(idarchive) as idarchive
FROM %s
WHERE idsite IN (" . implode(',', $siteIds) . ")
- AND " . self::getNameCondition($plugins, $segment) . "
+ AND " . self::getNameCondition($plugins, $segment, $includeInvalidated) . "
AND %s
GROUP BY idsite, date1, date2, name";
@@ -365,16 +366,17 @@ class ArchiveSelector
*
* @param array $plugins
* @param Segment $segment
+ * @param bool $includeInvalidated
* @return string
*/
- private static function getNameCondition(array $plugins, Segment $segment)
+ private static function getNameCondition(array $plugins, Segment $segment, $includeInvalidated = true)
{
// the flags used to tell how the archiving process for a specific archive was completed,
// if it was completed
$doneFlags = Rules::getDoneFlags($plugins, $segment);
$allDoneFlags = "'" . implode("','", $doneFlags) . "'";
- $possibleValues = Rules::getSelectableDoneFlagValues();
+ $possibleValues = Rules::getSelectableDoneFlagValues($includeInvalidated);
// create the SQL to find archives that are DONE
return "((name IN ($allDoneFlags)) AND (value IN (" . implode(',', $possibleValues) . ")))";
diff --git a/tests/PHPUnit/Integration/CronArchiveTest.php b/tests/PHPUnit/Integration/CronArchiveTest.php
index 4f4f2f9588..74f0dc561b 100644
--- a/tests/PHPUnit/Integration/CronArchiveTest.php
+++ b/tests/PHPUnit/Integration/CronArchiveTest.php
@@ -151,22 +151,22 @@ Starting Matomo reports archiving...
Will pre-process for website id = 1, period = day, date = last%s
- pre-processing all visits
- skipping segment archiving for 'actions>=4'.
-- pre-processing segment 1/1 actions>=2
+- pre-processing segment 1/1 actions>=2 [date = last52]
Archived website id = 1, period = day, 1 segments, 1 visits in last %s days, 1 visits today, Time elapsed: %s
+- skipping segment archiving for 'actions>=4'.
Will pre-process for website id = 1, period = week, date = last%s
- pre-processing all visits
-- skipping segment archiving for 'actions>=4'.
-- pre-processing segment 1/1 actions>=2
+- pre-processing segment 1/1 actions>=2 [date = last260]
Archived website id = 1, period = week, 1 segments, 1 visits in last %s weeks, 1 visits this week, Time elapsed: %s
+- skipping segment archiving for 'actions>=4'.
Will pre-process for website id = 1, period = month, date = last%s
- pre-processing all visits
-- skipping segment archiving for 'actions>=4'.
-- pre-processing segment 1/1 actions>=2
+- pre-processing segment 1/1 actions>=2 [date = last52]
Archived website id = 1, period = month, 1 segments, 1 visits in last %s months, 1 visits this month, Time elapsed: %s
+- skipping segment archiving for 'actions>=4'.
Will pre-process for website id = 1, period = year, date = last%s
- pre-processing all visits
-- skipping segment archiving for 'actions>=4'.
-- pre-processing segment 1/1 actions>=2
+- pre-processing segment 1/1 actions>=2 [date = last7]
Archived website id = 1, period = year, 1 segments, 1 visits in last %s years, 1 visits this year, Time elapsed: %s
Archived website id = 1, %s API requests, Time elapsed: %s [1/1 done]
Done archiving!
diff --git a/tests/PHPUnit/Unit/Archiver/RequestTest.php b/tests/PHPUnit/Unit/Archiver/RequestTest.php
new file mode 100644
index 0000000000..ae168023c2
--- /dev/null
+++ b/tests/PHPUnit/Unit/Archiver/RequestTest.php
@@ -0,0 +1,47 @@
+<?php
+/**
+ * Piwik - free/libre analytics platform
+ *
+ * @link https://matomo.org
+ * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
+ *
+ */
+
+namespace Piwik\Tests\Unit\Archiver;
+
+
+use Piwik\Archiver\Request;
+
+class RequestTest extends \PHPUnit_Framework_TestCase
+{
+ /**
+ * @dataProvider getTestDataForChangeDate
+ */
+ public function test_changeDate_replacesDateProperly($url, $newDate, $expectedNewUrl)
+ {
+ $request = new Request($url);
+ $request->changeDate($newDate);
+ $this->assertEquals($expectedNewUrl, $request->getUrl());
+ }
+
+ public function getTestDataForChangeDate()
+ {
+ return [
+ [
+ 'http://abc.com/index.php?trigger=archivephp&method=API.get&date=2012-03-04',
+ 'last12',
+ 'http://abc.com/index.php?trigger=archivephp&method=API.get&date=last12',
+ ],
+ [
+ 'http://abc.com/index.php?trigger=archivephp&method=API.get&date=2012-03-04,2013-02-4&period=day',
+ 'previous18',
+ 'http://abc.com/index.php?trigger=archivephp&method=API.get&date=previous18&period=day',
+ ],
+ [
+ 'http://abc.com/index.php?date=lastN&period=day',
+ '2013-10-12,2013-11-19',
+ 'http://abc.com/index.php?date=2013-10-12,2013-11-19&period=day',
+ ],
+ ];
+ }
+} \ No newline at end of file