Welcome to mirror list, hosted at ThFree Co, Russian Federation.

heartbeat.cc « filed « src « core - github.com/bareos/bareos.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 9f5440562d52ff2aa28ae3f92dfc29b435f0b08f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
/*
   BAREOS® - Backup Archiving REcovery Open Sourced

   Copyright (C) 2003-2012 Free Software Foundation Europe e.V.
   Copyright (C) 2011-2012 Planets Communications B.V.
   Copyright (C) 2013-2020 Bareos GmbH & Co. KG

   This program is Free Software; you can redistribute it and/or
   modify it under the terms of version three of the GNU Affero General Public
   License as published by the Free Software Foundation and included
   in the file LICENSE.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Affero General Public License for more details.

   You should have received a copy of the GNU Affero General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.
*/
// Kern Sibbald, May MMIII
/**
 * @file
 * Bareos File Daemon heartbeat routines
 * Listens for heartbeats coming from the SD
 * If configured, sends heartbeats to Dir
 */

#include "include/bareos.h"
#include "filed/filed.h"
#include "filed/jcr_private.h"
#include "filed/filed_globals.h"
#include "lib/bnet.h"
#include "lib/bsock.h"
#include "lib/watchdog.h"

namespace filedaemon {

// Timeout in seconds that sd_heartbeat_thread() passes to BnetWaitDataIntr()
// on every pass of its loop.
#define WAIT_INTERVAL 5

// Thread entry points defined further down in this file; both are handed to
// pthread_create() by the Start*() functions.
extern "C" void* sd_heartbeat_thread(void* arg);
extern "C" void* dir_heartbeat_thread(void* arg);
// Defined outside this file. When true, StartHeartbeatMonitor() does not
// start a thread and StopHeartbeatMonitor() returns immediately.
extern bool no_signals;

/**
 * Listen on the SD socket for heartbeat signals.
 * Send heartbeats to the Director every HB_TIME
 *   seconds.
 */
extern "C" void* sd_heartbeat_thread(void* arg)
{
  int32_t n;
  JobControlRecord* jcr = (JobControlRecord*)arg;
  std::shared_ptr<BareosSocket> sd, dir;
  time_t last_heartbeat = time(NULL);
  time_t now;

  pthread_detach(pthread_self());

  // Get our own local copy
  sd.reset(jcr->store_bsock->clone());
  dir.reset(jcr->dir_bsock->clone());

  jcr->impl->hb_bsock = sd;
  jcr->impl->hb_running = true;
  jcr->impl->hb_dir_bsock = dir;
  dir->suppress_error_msgs_ = true;
  sd->suppress_error_msgs_ = true;
  jcr->impl->hb_initialized_once
      = true;  // initialize last to avoid race condition

  /* Hang reading the socket to the SD, and every time we get
   * a heartbeat or we get a wait timeout (5 seconds), we
   * check to see if we need to send a heartbeat to the
   * Director.
   */
  while (!sd->IsStop()) {
    n = BnetWaitDataIntr(sd.get(), WAIT_INTERVAL);
    if (n < 0 || sd->IsStop()) { break; }
    if (me->heartbeat_interval) {
      now = time(NULL);
      if (now - last_heartbeat >= me->heartbeat_interval) {
        dir->signal(BNET_HEARTBEAT);
        if (dir->IsStop()) { break; }
        last_heartbeat = now;
      }
    }
    if (n == 1) { /* input waiting */
      sd->recv(); /* read it -- probably heartbeat from sd */
      if (sd->IsStop()) { break; }
      if (sd->message_length <= 0) {
        Dmsg1(100, "Got BNET_SIG %d from SD\n", sd->message_length);
      } else {
        Dmsg2(100, "Got %d bytes from SD. MSG=%s\n", sd->message_length,
              sd->msg);
      }
    }
    Dmsg2(200, "wait_intr=%d stop=%d\n", n, IsBnetStop(sd.get()));
  }

  sd->close();
  dir->close();
  jcr->impl->hb_bsock.reset();
  jcr->impl->hb_running = false;
  jcr->impl->hb_dir_bsock = NULL;

  return NULL;
}

/**
 * Launch sd_heartbeat_thread() for the given job.
 *
 * Nothing is started when no_signals is set: the heartbeat would otherwise
 * produce a constant stream of TIMEOUT_SIGNAL signals that makes debugging
 * impossible.
 */
void StartHeartbeatMonitor(JobControlRecord* jcr)
{
  if (no_signals) { return; }

  // Clear all heartbeat state before the thread fills it in.
  jcr->impl->hb_bsock.reset();
  jcr->impl->hb_running = false;
  jcr->impl->hb_initialized_once = false;
  jcr->impl->hb_dir_bsock.reset();

  pthread_create(&jcr->impl->heartbeat_id, nullptr, sd_heartbeat_thread, jcr);
}

/**
 * Terminate the heartbeat thread. Used for both the SD monitor
 * (sd_heartbeat_thread) and the Director-only heartbeat
 * (dir_heartbeat_thread).
 *
 * Does nothing when no_signals is set. Otherwise:
 *  1. waits up to 10 seconds for the thread to finish its initialization,
 *  2. marks the heartbeat sockets as timed out and terminated,
 *  3. sends TIMEOUT_SIGNAL to the thread until it reports that it is no
 *     longer running (up to about 100 seconds),
 *  4. drops the jcr's references to the heartbeat sockets.
 */
void StopHeartbeatMonitor(JobControlRecord* jcr)
{
  if (no_signals) { return; }

  /* Wait max 10 secs (200 * 50 ms) for heartbeat thread to start */
  int cnt = 0;
  while (!jcr->impl->hb_initialized_once && cnt++ < 200) {
    Bmicrosleep(0, 50000); /* wait for start */
  }

  {
    /* The heartbeat thread resets hb_bsock *before* it clears hb_running, so
     * hb_running can still be true while hb_bsock is already empty. Work on
     * local copies and check them for null instead of dereferencing the jcr
     * members directly; the copies also keep the sockets alive while they
     * are being flagged. This narrows the window but is not a full
     * synchronization with the heartbeat thread. */
    auto hb_sock = jcr->impl->hb_bsock;
    if (jcr->impl->hb_running && hb_sock) {
      hb_sock->SetTimedOut();   /* set timed_out to Terminate read */
      hb_sock->SetTerminated(); /* set to Terminate read */
    }

    auto hb_dir_sock = jcr->impl->hb_dir_bsock;
    if (hb_dir_sock) {
      hb_dir_sock->SetTimedOut();   /* set timed_out to Terminate read */
      hb_dir_sock->SetTerminated(); /* set to Terminate read */
    }
  }

  if (jcr->impl->hb_running) {
    Dmsg0(100, "Send kill to heartbeat id\n");
    pthread_kill(jcr->impl->heartbeat_id,
                 TIMEOUT_SIGNAL); /* make heartbeat thread go away */
    Bmicrosleep(0, 50000);
  }

  // Wait max 100 secs (200 * 500 ms) for heartbeat thread to stop
  cnt = 0;
  while (jcr->impl->hb_running && cnt++ < 200) {
    pthread_kill(jcr->impl->heartbeat_id,
                 TIMEOUT_SIGNAL); /* make heartbeat thread go away */
    Bmicrosleep(0, 500000);
  }

  // Drop the jcr's references; reset() on an empty pointer is a no-op.
  jcr->impl->hb_bsock.reset();
  jcr->impl->hb_dir_bsock.reset();

  jcr->impl->hb_initialized_once = false;
}

/**
 * Thread for sending heartbeats to the Director when there
 *   is no SD monitoring needed -- e.g. restore and verify Vol
 *   both do their own read() on the SD socket.
 */
extern "C" void* dir_heartbeat_thread(void* arg)
{
  JobControlRecord* jcr = (JobControlRecord*)arg;
  BareosSocket* dir;
  time_t last_heartbeat = time(NULL);

  pthread_detach(pthread_self());

  // Get our own local copy
  dir = jcr->dir_bsock->clone();

  jcr->impl->hb_bsock.reset(dir);
  jcr->impl->hb_running = true;
  dir->suppress_error_msgs_ = true;
  jcr->impl->hb_initialized_once
      = true;  // initialize last to avoid race condition

  while (!dir->IsStop()) {
    time_t now, next;

    now = time(NULL);
    next = now - last_heartbeat;
    if (next >= me->heartbeat_interval) {
      dir->signal(BNET_HEARTBEAT);
      if (dir->IsStop()) { break; }
      last_heartbeat = now;
    }
    Bmicrosleep(next, 0);
  }
  dir->close();
  jcr->impl->hb_bsock.reset();
  jcr->impl->hb_running = false;
  return NULL;
}

/**
 * Launch dir_heartbeat_thread() for the given job: heartbeats go to the
 * Director only, the SD socket is not listened to.
 *
 * Nothing is started when me->heartbeat_interval is zero.
 */
void StartDirHeartbeat(JobControlRecord* jcr)
{
  if (!me->heartbeat_interval) { return; }

  jcr->dir_bsock->SetLocking();
  pthread_create(&jcr->impl->heartbeat_id, nullptr, dir_heartbeat_thread, jcr);
}

/**
 * Stop the thread started by StartDirHeartbeat(). Delegates to
 * StopHeartbeatMonitor(); does nothing when me->heartbeat_interval is zero.
 */
void StopDirHeartbeat(JobControlRecord* jcr)
{
  if (!me->heartbeat_interval) { return; }
  StopHeartbeatMonitor(jcr);
}
} /* namespace filedaemon */