root/daemons/controld/controld_remote_ra.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. free_cmd
  2. generate_callid
  3. recurring_helper
  4. start_delay_helper
  5. should_purge_attributes
  6. section_to_delete
  7. purge_remote_node_attrs
  8. remote_node_up
  9. remote_node_down
  10. check_remote_node_state
  11. report_remote_ra_result
  12. update_remaining_timeout
  13. retry_start_cmd_cb
  14. connection_takeover_timeout_cb
  15. monitor_timeout_cb
  16. synthesize_lrmd_success
  17. remote_lrm_op_callback
  18. handle_remote_ra_stop
  19. handle_remote_ra_start
  20. handle_remote_ra_exec
  21. remote_ra_data_init
  22. remote_ra_cleanup
  23. is_remote_lrmd_ra
  24. remote_ra_get_rsc_info
  25. is_remote_ra_supported_action
  26. fail_all_monitor_cmds
  27. remove_cmd
  28. remote_ra_cancel
  29. handle_dup_monitor
  30. controld_execute_remote_agent
  31. remote_ra_fail
  32. remote_ra_process_pseudo
  33. remote_ra_maintenance
  34. remote_ra_process_maintenance_nodes
  35. remote_ra_is_in_maintenance
  36. remote_ra_controlling_guest

   1 /*
   2  * Copyright 2013-2023 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
#include <crm_internal.h>

#include <limits.h>     // INT_MAX

#include <crm/crm.h>
#include <crm/msg_xml.h>
#include <crm/common/xml_internal.h>
#include <crm/lrmd.h>
#include <crm/lrmd_internal.h>
#include <crm/services.h>

#include <pacemaker-controld.h>
  20 
/* NOTE(review): presumably the agent type name for remote connection
 * resources — used outside this chunk; confirm against callers
 */
#define REMOTE_LRMD_RA "remote"

/* The max start timeout before cmd retry */
#define MAX_START_TIMEOUT_MS 10000
  25 
/* Convenience wrappers for changing a command's status flag group
 * (remote_ra_cmd_t:status) with trace-level logging of the change
 */
#define cmd_set_flags(cmd, flags_to_set) do { \
    (cmd)->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, \
                                       "Remote command", (cmd)->rsc_id, (cmd)->status, \
                                       (flags_to_set), #flags_to_set); \
        } while (0)

#define cmd_clear_flags(cmd, flags_to_clear) do { \
    (cmd)->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, \
                                         "Remote command", (cmd)->rsc_id, (cmd)->status, \
                                         (flags_to_clear), #flags_to_clear); \
        } while (0)
  37 
// Bit flags for remote_ra_cmd_t:status
enum remote_cmd_status {
    // Success has already been reported for this (recurring) command
    cmd_reported_success    = (1 << 0),
    // Command has been cancelled (recurring monitors won't be rescheduled)
    cmd_cancel              = (1 << 1),
};
  42 
// One queued or in-flight action for a remote connection resource
typedef struct remote_ra_cmd_s {
    /*! the local node the cmd is issued from */
    char *owner;
    /*! the remote node the cmd is executed on */
    char *rsc_id;
    /*! the action to execute */
    char *action;
    /*! some string the client wants us to give it back */
    char *userdata;
    /*! start delay in ms */
    int start_delay;
    /*! timer id used for start delay. */
    int delay_id;
    /*! timeout in ms for cmd */
    int timeout;
    /*! ms of timeout left (see update_remaining_timeout()) */
    int remaining_timeout;
    /*! recurring interval in ms */
    guint interval_ms;
    /*! interval timer id */
    int interval_id;
    /*! timer id for poke-response timeout (see monitor_timeout_cb()) */
    int monitor_timeout_id;
    /*! timer id for connection takeover timeout
     * (see connection_takeover_timeout_cb())
     */
    int takeover_timeout_id;
    /*! action parameters */
    lrmd_key_value_t *params;
    /*! result to report back to the requesting client */
    pcmk__action_result_t result;
    /*! call id assigned via generate_callid() */
    int call_id;
    /*! when the command began executing (wall clock) */
    time_t start_time;
    /*! group of enum remote_cmd_status flags */
    uint32_t status;
} remote_ra_cmd_t;
  72 
/* Convenience wrappers for changing a connection's status flag group
 * (remote_ra_data_t:status) with trace-level logging of the change
 */
#define lrm_remote_set_flags(lrm_state, flags_to_set) do { \
    lrm_state_t *lrm = (lrm_state); \
    remote_ra_data_t *ra = lrm->remote_ra_data; \
    ra->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \
                                    lrm->node_name, ra->status, \
                                    (flags_to_set), #flags_to_set); \
        } while (0)

#define lrm_remote_clear_flags(lrm_state, flags_to_clear) do { \
    lrm_state_t *lrm = (lrm_state); \
    remote_ra_data_t *ra = lrm->remote_ra_data; \
    ra->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \
                                      lrm->node_name, ra->status, \
                                      (flags_to_clear), #flags_to_clear); \
        } while (0)
  88 
// Bit flags for remote_ra_data_t:status
enum remote_status {
    // Another node is expected to take over this connection
    expect_takeover     = (1 << 0),
    // Another node has taken over this connection
    takeover_complete   = (1 << 1),
    // The TLS connection to the remote node is established
    remote_active       = (1 << 2),
    /* Maintenance mode is difficult to determine from the controller's context,
     * so we have it signalled back with the transition from the scheduler.
     */
    remote_in_maint     = (1 << 3),
    /* Similar for whether we are controlling a guest node or remote node.
     * Fortunately there is a meta-attribute in the transition already and
     * as the situation doesn't change over time we can use the
     * resource start for noting down the information for later use when
     * the attributes aren't at hand.
     */
    controlling_guest   = (1 << 4),
};
 105 
// Per-connection bookkeeping for a remote connection resource
typedef struct remote_ra_data_s {
    crm_trigger_t *work;        // mainloop trigger that drives queued commands
    remote_ra_cmd_t *cur_cmd;   // command currently being executed (if any)
    GList *cmds;                // commands queued for execution
    GList *recurring_cmds;      // recurring monitors waiting on their interval
    uint32_t status;            // group of enum remote_status flags
} remote_ra_data_t;

// Forward declarations for handlers referenced before their definitions
static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms);
static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd);
static GList *fail_all_monitor_cmds(GList * list);
 117 
 118 static void
 119 free_cmd(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 120 {
 121     remote_ra_cmd_t *cmd = user_data;
 122 
 123     if (!cmd) {
 124         return;
 125     }
 126     if (cmd->delay_id) {
 127         g_source_remove(cmd->delay_id);
 128     }
 129     if (cmd->interval_id) {
 130         g_source_remove(cmd->interval_id);
 131     }
 132     if (cmd->monitor_timeout_id) {
 133         g_source_remove(cmd->monitor_timeout_id);
 134     }
 135     if (cmd->takeover_timeout_id) {
 136         g_source_remove(cmd->takeover_timeout_id);
 137     }
 138     free(cmd->owner);
 139     free(cmd->rsc_id);
 140     free(cmd->action);
 141     free(cmd->userdata);
 142     pcmk__reset_result(&(cmd->result));
 143     lrmd_key_value_freeall(cmd->params);
 144     free(cmd);
 145 }
 146 
 147 static int
 148 generate_callid(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 149 {
 150     static int remote_ra_callid = 0;
 151 
 152     remote_ra_callid++;
 153     if (remote_ra_callid <= 0) {
 154         remote_ra_callid = 1;
 155     }
 156 
 157     return remote_ra_callid;
 158 }
 159 
 160 static gboolean
 161 recurring_helper(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 162 {
 163     remote_ra_cmd_t *cmd = data;
 164     lrm_state_t *connection_rsc = NULL;
 165 
 166     cmd->interval_id = 0;
 167     connection_rsc = lrm_state_find(cmd->rsc_id);
 168     if (connection_rsc && connection_rsc->remote_ra_data) {
 169         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 170 
 171         ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd);
 172 
 173         ra_data->cmds = g_list_append(ra_data->cmds, cmd);
 174         mainloop_set_trigger(ra_data->work);
 175     }
 176     return FALSE;
 177 }
 178 
 179 static gboolean
 180 start_delay_helper(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 181 {
 182     remote_ra_cmd_t *cmd = data;
 183     lrm_state_t *connection_rsc = NULL;
 184 
 185     cmd->delay_id = 0;
 186     connection_rsc = lrm_state_find(cmd->rsc_id);
 187     if (connection_rsc && connection_rsc->remote_ra_data) {
 188         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 189 
 190         mainloop_set_trigger(ra_data->work);
 191     }
 192     return FALSE;
 193 }
 194 
 195 static bool
 196 should_purge_attributes(crm_node_t *node)
     /* [previous][next][first][last][top][bottom][index][help] */
 197 {
 198     bool purge = true;
 199     crm_node_t *conn_node = NULL;
 200     lrm_state_t *connection_rsc = NULL;
 201 
 202     if (!node->conn_host) {
 203         return purge;
 204     }
 205 
 206     /* Get the node that was hosting the remote connection resource from the
 207      * peer cache.  That's the one we really care about here.
 208      */
 209     conn_node = crm_get_peer(0, node->conn_host);
 210     if (conn_node == NULL) {
 211         return purge;
 212     }
 213 
 214     /* Check the uptime of connection_rsc.  If it hasn't been running long
 215      * enough, set purge=true.  "Long enough" means it started running earlier
 216      * than the timestamp when we noticed it went away in the first place.
 217      */
 218     connection_rsc = lrm_state_find(node->uname);
 219 
 220     if (connection_rsc != NULL) {
 221         lrmd_t *lrm = connection_rsc->conn;
 222         time_t uptime = lrmd__uptime(lrm);
 223         time_t now = time(NULL);
 224 
 225         /* Add 20s of fuzziness to give corosync a while to notice the remote
 226          * host is gone.  On various error conditions (failure to get uptime,
 227          * peer_lost isn't set) we default to purging.
 228          */
 229         if (uptime > 0 &&
 230             conn_node->peer_lost > 0 &&
 231             uptime + 20 >= now - conn_node->peer_lost) {
 232             purge = false;
 233         }
 234     }
 235 
 236     return purge;
 237 }
 238 
 239 static enum controld_section_e
 240 section_to_delete(bool purge)
     /* [previous][next][first][last][top][bottom][index][help] */
 241 {
 242     if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
 243         if (purge) {
 244             return controld_section_all_unlocked;
 245         } else {
 246             return controld_section_lrm_unlocked;
 247         }
 248     } else {
 249         if (purge) {
 250             return controld_section_all;
 251         } else {
 252             return controld_section_lrm;
 253         }
 254     }
 255 }
 256 
 257 static void
 258 purge_remote_node_attrs(int call_opt, crm_node_t *node)
     /* [previous][next][first][last][top][bottom][index][help] */
 259 {
 260     bool purge = should_purge_attributes(node);
 261     enum controld_section_e section = section_to_delete(purge);
 262 
 263     /* Purge node from attrd's memory */
 264     if (purge) {
 265         update_attrd_remote_node_removed(node->uname, NULL);
 266     }
 267 
 268     controld_delete_node_state(node->uname, section, call_opt);
 269 }
 270 
 271 /*!
 272  * \internal
 273  * \brief Handle cluster communication related to pacemaker_remote node joining
 274  *
 275  * \param[in] node_name  Name of newly integrated pacemaker_remote node
 276  */
 277 static void
 278 remote_node_up(const char *node_name)
     /* [previous][next][first][last][top][bottom][index][help] */
 279 {
 280     int call_opt;
 281     xmlNode *update, *state;
 282     crm_node_t *node;
 283     lrm_state_t *connection_rsc = NULL;
 284 
 285     CRM_CHECK(node_name != NULL, return);
 286     crm_info("Announcing Pacemaker Remote node %s", node_name);
 287 
 288     call_opt = crmd_cib_smart_opt();
 289 
 290     /* Delete node's probe_complete attribute. This serves two purposes:
 291      *
 292      * - @COMPAT DCs < 1.1.14 in a rolling upgrade might use it
 293      * - deleting it (or any attribute for that matter) here ensures the
 294      *   attribute manager learns the node is remote
 295      */
 296     update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE);
 297 
 298     /* Ensure node is in the remote peer cache with member status */
 299     node = crm_remote_peer_get(node_name);
 300     CRM_CHECK(node != NULL, return);
 301 
 302     purge_remote_node_attrs(call_opt, node);
 303     pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);
 304 
 305     /* Apply any start state that we were given from the environment on the
 306      * remote node.
 307      */
 308     connection_rsc = lrm_state_find(node->uname);
 309 
 310     if (connection_rsc != NULL) {
 311         lrmd_t *lrm = connection_rsc->conn;
 312         const char *start_state = lrmd__node_start_state(lrm);
 313 
 314         if (start_state) {
 315             set_join_state(start_state, node->uname, node->uuid, true);
 316         }
 317     }
 318 
 319     /* pacemaker_remote nodes don't participate in the membership layer,
 320      * so cluster nodes don't automatically get notified when they come and go.
 321      * We send a cluster message to the DC, and update the CIB node state entry,
 322      * so the DC will get it sooner (via message) or later (via CIB refresh),
 323      * and any other interested parties can query the CIB.
 324      */
 325     broadcast_remote_state_message(node_name, true);
 326 
 327     update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
 328     state = create_node_state_update(node, node_update_cluster, update,
 329                                      __func__);
 330 
 331     /* Clear the XML_NODE_IS_FENCED flag in the node state. If the node ever
 332      * needs to be fenced, this flag will allow various actions to determine
 333      * whether the fencing has happened yet.
 334      */
 335     crm_xml_add(state, XML_NODE_IS_FENCED, "0");
 336 
 337     /* TODO: If the remote connection drops, and this (async) CIB update either
 338      * failed or has not yet completed, later actions could mistakenly think the
 339      * node has already been fenced (if the XML_NODE_IS_FENCED attribute was
 340      * previously set, because it won't have been cleared). This could prevent
 341      * actual fencing or allow recurring monitor failures to be cleared too
 342      * soon. Ideally, we wouldn't rely on the CIB for the fenced status.
 343      */
 344     controld_update_cib(XML_CIB_TAG_STATUS, update, call_opt, NULL);
 345     free_xml(update);
 346 }
 347 
// Whether remote_node_down() should keep or erase the node's resource history
enum down_opts {
    DOWN_KEEP_LRM,   // keep resource history (normal stop/disconnect)
    DOWN_ERASE_LRM   // erase resource history (e.g. after successful fencing)
};
 352 
 353 /*!
 354  * \internal
 355  * \brief Handle cluster communication related to pacemaker_remote node leaving
 356  *
 357  * \param[in] node_name  Name of lost node
 358  * \param[in] opts       Whether to keep or erase LRM history
 359  */
 360 static void
 361 remote_node_down(const char *node_name, const enum down_opts opts)
     /* [previous][next][first][last][top][bottom][index][help] */
 362 {
 363     xmlNode *update;
 364     int call_opt = crmd_cib_smart_opt();
 365     crm_node_t *node;
 366 
 367     /* Purge node from attrd's memory */
 368     update_attrd_remote_node_removed(node_name, NULL);
 369 
 370     /* Normally, only node attributes should be erased, and the resource history
 371      * should be kept until the node comes back up. However, after a successful
 372      * fence, we want to clear the history as well, so we don't think resources
 373      * are still running on the node.
 374      */
 375     if (opts == DOWN_ERASE_LRM) {
 376         controld_delete_node_state(node_name, controld_section_all, call_opt);
 377     } else {
 378         controld_delete_node_state(node_name, controld_section_attrs, call_opt);
 379     }
 380 
 381     /* Ensure node is in the remote peer cache with lost state */
 382     node = crm_remote_peer_get(node_name);
 383     CRM_CHECK(node != NULL, return);
 384     pcmk__update_peer_state(__func__, node, CRM_NODE_LOST, 0);
 385 
 386     /* Notify DC */
 387     broadcast_remote_state_message(node_name, false);
 388 
 389     /* Update CIB node state */
 390     update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
 391     create_node_state_update(node, node_update_cluster, update, __func__);
 392     controld_update_cib(XML_CIB_TAG_STATUS, update, call_opt, NULL);
 393     free_xml(update);
 394 }
 395 
 396 /*!
 397  * \internal
 398  * \brief Handle effects of a remote RA command on node state
 399  *
 400  * \param[in] cmd  Completed remote RA command
 401  */
 402 static void
 403 check_remote_node_state(const remote_ra_cmd_t *cmd)
     /* [previous][next][first][last][top][bottom][index][help] */
 404 {
 405     /* Only successful actions can change node state */
 406     if (!pcmk__result_ok(&(cmd->result))) {
 407         return;
 408     }
 409 
 410     if (pcmk__str_eq(cmd->action, PCMK_ACTION_START, pcmk__str_casei)) {
 411         remote_node_up(cmd->rsc_id);
 412 
 413     } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_MIGRATE_FROM,
 414                             pcmk__str_casei)) {
 415         /* After a successful migration, we don't need to do remote_node_up()
 416          * because the DC already knows the node is up, and we don't want to
 417          * clear LRM history etc. We do need to add the remote node to this
 418          * host's remote peer cache, because (unless it happens to be DC)
 419          * it hasn't been tracking the remote node, and other code relies on
 420          * the cache to distinguish remote nodes from unseen cluster nodes.
 421          */
 422         crm_node_t *node = crm_remote_peer_get(cmd->rsc_id);
 423 
 424         CRM_CHECK(node != NULL, return);
 425         pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);
 426 
 427     } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_STOP, pcmk__str_casei)) {
 428         lrm_state_t *lrm_state = lrm_state_find(cmd->rsc_id);
 429         remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL;
 430 
 431         if (ra_data) {
 432             if (!pcmk_is_set(ra_data->status, takeover_complete)) {
 433                 /* Stop means down if we didn't successfully migrate elsewhere */
 434                 remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
 435             } else if (AM_I_DC == FALSE) {
 436                 /* Only the connection host and DC track node state,
 437                  * so if the connection migrated elsewhere and we aren't DC,
 438                  * un-cache the node, so we don't have stale info
 439                  */
 440                 crm_remote_peer_cache_remove(cmd->rsc_id);
 441             }
 442         }
 443     }
 444 
 445     /* We don't do anything for successful monitors, which is correct for
 446      * routine recurring monitors, and for monitors on nodes where the
 447      * connection isn't supposed to be (the cluster will stop the connection in
 448      * that case). However, if the initial probe finds the connection already
 449      * active on the node where we want it, we probably should do
 450      * remote_node_up(). Unfortunately, we can't distinguish that case here.
 451      * Given that connections have to be initiated by the cluster, the chance of
 452      * that should be close to zero.
 453      */
 454 }
 455 
/*!
 * \internal
 * \brief Report a remote RA command's result to the controller
 *
 * Applies any node-state side effects of the result (via
 * check_remote_node_state()), then builds a synthetic executor event from
 * \p cmd and dispatches it through lrm_op_callback().
 *
 * \param[in,out] cmd  Completed remote RA command to report
 */
static void
report_remote_ra_result(remote_ra_cmd_t * cmd)
{
    lrmd_event_data_t op = { 0, };

    check_remote_node_state(cmd);

    // Populate the synthetic event from the command's fields
    op.type = lrmd_event_exec_complete;
    op.rsc_id = cmd->rsc_id;
    op.op_type = cmd->action;
    op.user_data = cmd->userdata;
    op.timeout = cmd->timeout;
    op.interval_ms = cmd->interval_ms;
    op.t_run = (unsigned int) cmd->start_time;
    op.t_rcchange = (unsigned int) cmd->start_time;

    lrmd__set_result(&op, cmd->result.exit_status, cmd->result.execution_status,
                     cmd->result.exit_reason);

    // A failure after an earlier reported success must get a later timestamp
    if (pcmk_is_set(cmd->status, cmd_reported_success) && !pcmk__result_ok(&(cmd->result))) {
        op.t_rcchange = (unsigned int) time(NULL);
        /* This edge case will likely never ever occur, but if it does the
         * result is that a failure will not be processed correctly. This is only
         * remotely possible because we are able to detect a connection resource's tcp
         * connection has failed at any moment after start has completed. The actual
         * recurring operation is just a connectivity ping.
         *
         * basically, we are not guaranteed that the first successful monitor op and
         * a subsequent failed monitor op will not occur in the same timestamp. We have to
         * make it look like the operations occurred at separate times though. */
        if (op.t_rcchange == op.t_run) {
            op.t_rcchange++;
        }
    }

    // Copy the command's parameters into the event (freed below)
    if (cmd->params) {
        lrmd_key_value_t *tmp;

        op.params = pcmk__strkey_table(free, free);
        for (tmp = cmd->params; tmp; tmp = tmp->next) {
            g_hash_table_insert(op.params, strdup(tmp->key), strdup(tmp->value));
        }

    }
    op.call_id = cmd->call_id;
    op.remote_nodename = cmd->owner;

    lrm_op_callback(&op);

    if (op.params) {
        g_hash_table_destroy(op.params);
    }
    lrmd__reset_result(&op);
}
 510 
 511 static void
 512 update_remaining_timeout(remote_ra_cmd_t * cmd)
     /* [previous][next][first][last][top][bottom][index][help] */
 513 {
 514     cmd->remaining_timeout = ((cmd->timeout / 1000) - (time(NULL) - cmd->start_time)) * 1000;
 515 }
 516 
/*!
 * \internal
 * \brief Timer callback to retry establishing a remote connection
 *
 * Scheduled from remote_lrm_op_callback() when an async connection attempt
 * failed with a retryable error and enough of the command's timeout remains.
 *
 * \param[in,out] data  lrm_state_t of the connection being started
 *
 * \return FALSE (one-shot timer; never rescheduled by GLib)
 */
static gboolean
retry_start_cmd_cb(gpointer data)
{
    lrm_state_t *lrm_state = data;
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    remote_ra_cmd_t *cmd = NULL;
    int rc = ETIME; /* any nonzero value takes the failure path below;
                     * NOTE(review): mixes errno with pcmk_rc_* codes — works
                     * because only rc != pcmk_rc_ok is tested, but confirm
                     */

    if (!ra_data || !ra_data->cur_cmd) {
        // Nothing in flight anymore; nothing to retry
        return FALSE;
    }
    cmd = ra_data->cur_cmd;
    // Only connection-establishing actions are retried
    if (!pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START,
                              PCMK_ACTION_MIGRATE_FROM, NULL)) {
        return FALSE;
    }
    update_remaining_timeout(cmd);

    if (cmd->remaining_timeout > 0) {
        rc = handle_remote_ra_start(lrm_state, cmd, cmd->remaining_timeout);
    } else {
        pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                         PCMK_EXEC_TIMEOUT,
                         "Not enough time remains to retry remote connection");
    }

    if (rc != pcmk_rc_ok) {
        // Retry could not be initiated: report failure and advance the queue
        report_remote_ra_result(cmd);

        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
        ra_data->cur_cmd = NULL;
        free_cmd(cmd);
    } else {
        /* wait for connection event */
    }

    return FALSE;
}
 557 
 558 
 559 static gboolean
 560 connection_takeover_timeout_cb(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 561 {
 562     lrm_state_t *lrm_state = NULL;
 563     remote_ra_cmd_t *cmd = data;
 564 
 565     crm_info("takeover event timed out for node %s", cmd->rsc_id);
 566     cmd->takeover_timeout_id = 0;
 567 
 568     lrm_state = lrm_state_find(cmd->rsc_id);
 569 
 570     handle_remote_ra_stop(lrm_state, cmd);
 571     free_cmd(cmd);
 572 
 573     return FALSE;
 574 }
 575 
 576 static gboolean
 577 monitor_timeout_cb(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 578 {
 579     lrm_state_t *lrm_state = NULL;
 580     remote_ra_cmd_t *cmd = data;
 581 
 582     lrm_state = lrm_state_find(cmd->rsc_id);
 583 
 584     crm_info("Timed out waiting for remote poke response from %s%s",
 585              cmd->rsc_id, (lrm_state? "" : " (no LRM state)"));
 586     cmd->monitor_timeout_id = 0;
 587     pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT,
 588                      "Remote executor did not respond");
 589 
 590     if (lrm_state && lrm_state->remote_ra_data) {
 591         remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 592 
 593         if (ra_data->cur_cmd == cmd) {
 594             ra_data->cur_cmd = NULL;
 595         }
 596         if (ra_data->cmds) {
 597             mainloop_set_trigger(ra_data->work);
 598         }
 599     }
 600 
 601     report_remote_ra_result(cmd);
 602     free_cmd(cmd);
 603 
 604     if(lrm_state) {
 605         lrm_state_disconnect(lrm_state);
 606     }
 607     return FALSE;
 608 }
 609 
/*!
 * \internal
 * \brief Fabricate and process a successful executor event
 *
 * Used when no real executor reply will arrive, e.g. to fake the result of
 * a stop after an unmanaged connection drops.
 *
 * \param[in,out] lrm_state  Executor state to process the event against
 *                           (NULL means use the local node's state)
 * \param[in]     rsc_id     Resource the synthetic event is for
 * \param[in]     op_type    Action name to report as successfully completed
 */
static void
synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type)
{
    lrmd_event_data_t op = { 0, };

    if (lrm_state == NULL) {
        /* if lrm_state not given assume local */
        lrm_state = lrm_state_find(controld_globals.our_nodename);
    }
    CRM_ASSERT(lrm_state != NULL);

    op.type = lrmd_event_exec_complete;
    op.rsc_id = rsc_id;
    op.op_type = op_type;
    op.t_run = (unsigned int) time(NULL);
    op.t_rcchange = op.t_run;
    op.call_id = generate_callid();
    lrmd__set_result(&op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
    process_lrm_event(lrm_state, &op, NULL, NULL);
}
 630 
/*!
 * \internal
 * \brief Process an event from a remote connection's executor
 *
 * Dispatches on event type: new-client events indicate a takeover (expected
 * or not); exec-complete events are forwarded up; disconnects with no
 * command in flight are handled per the connection's state; otherwise the
 * event is matched against the currently executing command (start/migrate,
 * monitor poke, monitor disconnect, or stop) and its result is reported.
 *
 * \param[in,out] op  Executor event to process
 */
void
remote_lrm_op_callback(lrmd_event_data_t * op)
{
    gboolean cmd_handled = FALSE;
    lrm_state_t *lrm_state = NULL;
    remote_ra_data_t *ra_data = NULL;
    remote_ra_cmd_t *cmd = NULL;

    crm_debug("Processing '%s%s%s' event on remote connection to %s: %s "
              "(%d) status=%s (%d)",
              (op->op_type? op->op_type : ""), (op->op_type? " " : ""),
              lrmd_event_type2str(op->type), op->remote_nodename,
              services_ocf_exitcode_str(op->rc), op->rc,
              pcmk_exec_status_str(op->op_status), op->op_status);

    lrm_state = lrm_state_find(op->remote_nodename);
    if (!lrm_state || !lrm_state->remote_ra_data) {
        crm_debug("No state information found for remote connection event");
        return;
    }
    ra_data = lrm_state->remote_ra_data;

    if (op->type == lrmd_event_new_client) {
        // Another client has connected to the remote daemon

        if (pcmk_is_set(ra_data->status, expect_takeover)) {
            // Great, we knew this was coming
            lrm_remote_clear_flags(lrm_state, expect_takeover);
            lrm_remote_set_flags(lrm_state, takeover_complete);

        } else {
            crm_err("Disconnecting from Pacemaker Remote node %s due to "
                    "unexpected client takeover", op->remote_nodename);
            /* In this case, lrmd_tls_connection_destroy() will be called under the control of mainloop. */
            /* Do not free lrm_state->conn yet. */
            /* It'll be freed in the following stop action. */
            lrm_state_disconnect_only(lrm_state);
        }
        return;
    }

    /* filter all EXEC events up */
    if (op->type == lrmd_event_exec_complete) {
        if (pcmk_is_set(ra_data->status, takeover_complete)) {
            crm_debug("ignoring event, this connection is taken over by another node");
        } else {
            lrm_op_callback(op);
        }
        return;
    }

    // Disconnect with no command in flight: handle per connection state
    if ((op->type == lrmd_event_disconnect) && (ra_data->cur_cmd == NULL)) {

        if (!pcmk_is_set(ra_data->status, remote_active)) {
            // We initiated this disconnection, so it's expected
            crm_debug("Disconnection from Pacemaker Remote node %s complete",
                      lrm_state->node_name);

        } else if (!remote_ra_is_in_maintenance(lrm_state)) {
            // Unexpected drop of a managed connection: fail all monitors
            crm_err("Lost connection to Pacemaker Remote node %s",
                    lrm_state->node_name);
            ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
            ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);

        } else {
            crm_notice("Unmanaged Pacemaker Remote node %s disconnected",
                       lrm_state->node_name);
            /* Do roughly what a 'stop' on the remote-resource would do */
            handle_remote_ra_stop(lrm_state, NULL);
            remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM);
            /* now fake the reply of a successful 'stop' */
            synthesize_lrmd_success(NULL, lrm_state->node_name,
                                    PCMK_ACTION_STOP);
        }
        return;
    }

    if (!ra_data->cur_cmd) {
        crm_debug("no event to match");
        return;
    }

    cmd = ra_data->cur_cmd;

    /* Start actions and migrate from actions complete after connection
     * comes back to us. */
    if ((op->type == lrmd_event_connect)
        && pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START,
                                PCMK_ACTION_MIGRATE_FROM, NULL)) {
        if (op->connection_rc < 0) {
            update_remaining_timeout(cmd);

            if ((op->connection_rc == -ENOKEY)
                || (op->connection_rc == -EKEYREJECTED)) {
                // Hard error, don't retry
                pcmk__set_result(&(cmd->result), PCMK_OCF_INVALID_PARAM,
                                 PCMK_EXEC_ERROR,
                                 pcmk_strerror(op->connection_rc));

            } else if (cmd->remaining_timeout > 3000) {
                crm_trace("rescheduling start, remaining timeout %d", cmd->remaining_timeout);
                g_timeout_add(1000, retry_start_cmd_cb, lrm_state);
                return; // cmd stays cur_cmd; retry timer owns the next step

            } else {
                crm_trace("can't reschedule start, remaining timeout too small %d",
                          cmd->remaining_timeout);
                pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                                    PCMK_EXEC_TIMEOUT,
                                    "%s without enough time to retry",
                                    pcmk_strerror(op->connection_rc));
            }

        } else {
            // Connection established: reset cached state and mark active
            lrm_state_reset_tables(lrm_state, TRUE);
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            lrm_remote_set_flags(lrm_state, remote_active);
        }

        crm_debug("Remote connection event matched %s action", cmd->action);
        report_remote_ra_result(cmd);
        cmd_handled = TRUE;

    } else if ((op->type == lrmd_event_poke)
               && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                               pcmk__str_casei)) {

        if (cmd->monitor_timeout_id) {
            g_source_remove(cmd->monitor_timeout_id);
            cmd->monitor_timeout_id = 0;
        }

        /* Only report success the first time, after that only worry about failures.
         * For this function, if we get the poke pack, it is always a success. Pokes
         * only fail if the send fails, or the response times out. */
        if (!pcmk_is_set(cmd->status, cmd_reported_success)) {
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);
            cmd_set_flags(cmd, cmd_reported_success);
        }

        crm_debug("Remote poke event matched %s action", cmd->action);

        /* success, keep rescheduling if interval is present. */
        if (cmd->interval_ms && !pcmk_is_set(cmd->status, cmd_cancel)) {
            ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd);
            cmd->interval_id = g_timeout_add(cmd->interval_ms,
                                             recurring_helper, cmd);
            cmd = NULL;         /* prevent free */
        }
        cmd_handled = TRUE;

    } else if ((op->type == lrmd_event_disconnect)
               && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                               pcmk__str_casei)) {
        // Connection dropped while a monitor was in flight
        if (pcmk_is_set(ra_data->status, remote_active) &&
            !pcmk_is_set(cmd->status, cmd_cancel)) {
            pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                             PCMK_EXEC_ERROR,
                             "Remote connection unexpectedly dropped "
                             "during monitor");
            report_remote_ra_result(cmd);
            crm_err("Remote connection to %s unexpectedly dropped during monitor",
                    lrm_state->node_name);
        }
        cmd_handled = TRUE;

    } else if ((op->type == lrmd_event_new_client)
               && pcmk__str_eq(cmd->action, PCMK_ACTION_STOP,
                               pcmk__str_casei)) {

        // A pending stop was waiting for the takeover client to appear
        handle_remote_ra_stop(lrm_state, cmd);
        cmd_handled = TRUE;

    } else {
        crm_debug("Event did not match %s action", ra_data->cur_cmd->action);
    }

    if (cmd_handled) {
        // Command is complete: clear it and let any queued commands run
        ra_data->cur_cmd = NULL;
        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
        free_cmd(cmd);
    }
}
 816 
/*!
 * \internal
 * \brief Handle a stop of a remote connection resource
 *
 * Tear down local bookkeeping for the connection (pending operations, queued
 * commands, connection state) and, if a stop command is supplied, report it
 * as successful.
 *
 * \param[in,out] lrm_state  Executor state object for the remote connection
 * \param[in,out] cmd        Stop command being executed, or NULL when the stop
 *                           is implied rather than an in-flight command
 */
static void
handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)
{
    remote_ra_data_t *ra_data = NULL;

    CRM_ASSERT(lrm_state);
    ra_data = lrm_state->remote_ra_data;

    if (!pcmk_is_set(ra_data->status, takeover_complete)) {
        /* delete pending ops when ever the remote connection is intentionally stopped */
        g_hash_table_remove_all(lrm_state->active_ops);
    } else {
        /* we no longer hold the history if this connection has been migrated,
         * however, we keep metadata cache for future use */
        lrm_state_reset_tables(lrm_state, FALSE);
    }

    // Mark the connection inactive and close it
    lrm_remote_clear_flags(lrm_state, remote_active);
    lrm_state_disconnect(lrm_state);

    // Discard all queued and recurring commands for this connection
    if (ra_data->cmds) {
        g_list_free_full(ra_data->cmds, free_cmd);
    }
    if (ra_data->recurring_cmds) {
        g_list_free_full(ra_data->recurring_cmds, free_cmd);
    }
    ra_data->cmds = NULL;
    ra_data->recurring_cmds = NULL;
    ra_data->cur_cmd = NULL;

    if (cmd) {
        // The stop itself succeeded; report it to the controller
        pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
        report_remote_ra_result(cmd);
    }
}
 852 
 853 // \return Standard Pacemaker return code
 854 static int
 855 handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms)
     /* [previous][next][first][last][top][bottom][index][help] */
 856 {
 857     const char *server = NULL;
 858     lrmd_key_value_t *tmp = NULL;
 859     int port = 0;
 860     int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms;
 861     int rc = pcmk_rc_ok;
 862 
 863     for (tmp = cmd->params; tmp; tmp = tmp->next) {
 864         if (pcmk__strcase_any_of(tmp->key, XML_RSC_ATTR_REMOTE_RA_ADDR,
 865                                  XML_RSC_ATTR_REMOTE_RA_SERVER, NULL)) {
 866             server = tmp->value;
 867         } else if (pcmk__str_eq(tmp->key, XML_RSC_ATTR_REMOTE_RA_PORT, pcmk__str_casei)) {
 868             port = atoi(tmp->value);
 869         } else if (pcmk__str_eq(tmp->key, CRM_META "_" XML_RSC_ATTR_CONTAINER, pcmk__str_casei)) {
 870             lrm_remote_set_flags(lrm_state, controlling_guest);
 871         }
 872     }
 873 
 874     rc = controld_connect_remote_executor(lrm_state, server, port,
 875                                           timeout_used);
 876     if (rc != pcmk_rc_ok) {
 877         pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 878                             PCMK_EXEC_ERROR,
 879                             "Could not connect to Pacemaker Remote node %s: %s",
 880                             lrm_state->node_name, pcmk_rc_str(rc));
 881     }
 882     return rc;
 883 }
 884 
/*!
 * \internal
 * \brief Mainloop trigger handler: dispatch queued remote connection commands
 *
 * Process commands from ra_data->cmds in order. Commands that complete
 * asynchronously (start/migrate_from connection attempts, monitor pokes, and
 * stops waiting for a takeover) are parked in ra_data->cur_cmd until their
 * completion event arrives; everything else is reported and freed here.
 *
 * \param[in,out] user_data  lrm_state_t object for the remote connection
 *
 * \return TRUE always (keep the trigger registered)
 */
static gboolean
handle_remote_ra_exec(gpointer user_data)
{
    int rc = 0;
    lrm_state_t *lrm_state = user_data;
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    remote_ra_cmd_t *cmd;
    GList *first = NULL;

    if (ra_data->cur_cmd) {
        /* still waiting on previous cmd */
        return TRUE;
    }

    while (ra_data->cmds) {
        first = ra_data->cmds;
        cmd = first->data;
        if (cmd->delay_id) {
            /* still waiting for start delay timer to trip */
            return TRUE;
        }

        // Pop the command off the queue before executing it
        ra_data->cmds = g_list_remove_link(ra_data->cmds, first);
        g_list_free_1(first);

        if (pcmk__str_any_of(cmd->action, PCMK_ACTION_START,
                             PCMK_ACTION_MIGRATE_FROM, NULL)) {
            lrm_remote_clear_flags(lrm_state, expect_takeover | takeover_complete);
            if (handle_remote_ra_start(lrm_state, cmd,
                                       cmd->timeout) == pcmk_rc_ok) {
                /* take care of this later when we get async connection result */
                crm_debug("Initiated async remote connection, %s action will complete after connect event",
                          cmd->action);
                ra_data->cur_cmd = cmd;
                return TRUE;
            }
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, PCMK_ACTION_MONITOR)) {

            if (lrm_state_is_connected(lrm_state) == TRUE) {
                rc = lrm_state_poke_connection(lrm_state);
                if (rc < 0) {
                    pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                                     PCMK_EXEC_ERROR, pcmk_strerror(rc));
                }
            } else {
                // An inactive connection "monitors" as cleanly stopped
                rc = -1;
                pcmk__set_result(&(cmd->result), PCMK_OCF_NOT_RUNNING,
                                 PCMK_EXEC_DONE, "Remote connection inactive");
            }

            if (rc == 0) {
                // Poke sent; wait for the async reply or the monitor timeout
                crm_debug("Poked Pacemaker Remote at node %s, waiting for async response",
                          cmd->rsc_id);
                ra_data->cur_cmd = cmd;
                cmd->monitor_timeout_id = g_timeout_add(cmd->timeout, monitor_timeout_cb, cmd);
                return TRUE;
            }
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, PCMK_ACTION_STOP)) {

            if (pcmk_is_set(ra_data->status, expect_takeover)) {
                /* briefly wait on stop for the takeover event to occur. If the
                 * takeover event does not occur during the wait period, that's fine.
                 * It just means that the remote-node's lrm_status section is going to get
                 * cleared which will require all the resources running in the remote-node
                 * to be explicitly re-detected via probe actions.  If the takeover does occur
                 * successfully, then we can leave the status section intact. */
                cmd->takeover_timeout_id = g_timeout_add((cmd->timeout/2), connection_takeover_timeout_cb, cmd);
                ra_data->cur_cmd = cmd;
                return TRUE;
            }

            handle_remote_ra_stop(lrm_state, cmd);

        } else if (strcmp(cmd->action, PCMK_ACTION_MIGRATE_TO) == 0) {
            // Just note that a takeover is expected; success is immediate
            lrm_remote_clear_flags(lrm_state, takeover_complete);
            lrm_remote_set_flags(lrm_state, expect_takeover);
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);

        } else if (pcmk__str_any_of(cmd->action, PCMK_ACTION_RELOAD,
                                    PCMK_ACTION_RELOAD_AGENT, NULL))  {
            /* Currently the only reloadable parameter is reconnect_interval,
             * which is only used by the scheduler via the CIB, so reloads are a
             * no-op.
             *
             * @COMPAT DC <2.1.0: We only need to check for "reload" in case
             * we're in a rolling upgrade with a DC scheduling "reload" instead
             * of "reload-agent". An OCF 1.1 "reload" would be a no-op anyway,
             * so this would work for that purpose as well.
             */
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);
        }

        free_cmd(cmd);
    }

    return TRUE;
}
 988 
 989 static void
 990 remote_ra_data_init(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
 991 {
 992     remote_ra_data_t *ra_data = NULL;
 993 
 994     if (lrm_state->remote_ra_data) {
 995         return;
 996     }
 997 
 998     ra_data = calloc(1, sizeof(remote_ra_data_t));
 999     ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state);
1000     lrm_state->remote_ra_data = ra_data;
1001 }
1002 
1003 void
1004 remote_ra_cleanup(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1005 {
1006     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1007 
1008     if (!ra_data) {
1009         return;
1010     }
1011 
1012     if (ra_data->cmds) {
1013         g_list_free_full(ra_data->cmds, free_cmd);
1014     }
1015 
1016     if (ra_data->recurring_cmds) {
1017         g_list_free_full(ra_data->recurring_cmds, free_cmd);
1018     }
1019     mainloop_destroy_trigger(ra_data->work);
1020     free(ra_data);
1021     lrm_state->remote_ra_data = NULL;
1022 }
1023 
1024 gboolean
1025 is_remote_lrmd_ra(const char *agent, const char *provider, const char *id)
     /* [previous][next][first][last][top][bottom][index][help] */
1026 {
1027     if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) {
1028         return TRUE;
1029     }
1030     if ((id != NULL) && (lrm_state_find(id) != NULL)
1031         && !pcmk__str_eq(id, controld_globals.our_nodename, pcmk__str_casei)) {
1032         return TRUE;
1033     }
1034 
1035     return FALSE;
1036 }
1037 
1038 lrmd_rsc_info_t *
1039 remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id)
     /* [previous][next][first][last][top][bottom][index][help] */
1040 {
1041     lrmd_rsc_info_t *info = NULL;
1042 
1043     if ((lrm_state_find(rsc_id))) {
1044         info = calloc(1, sizeof(lrmd_rsc_info_t));
1045 
1046         info->id = strdup(rsc_id);
1047         info->type = strdup(REMOTE_LRMD_RA);
1048         info->standard = strdup(PCMK_RESOURCE_CLASS_OCF);
1049         info->provider = strdup("pacemaker");
1050     }
1051 
1052     return info;
1053 }
1054 
1055 static gboolean
1056 is_remote_ra_supported_action(const char *action)
     /* [previous][next][first][last][top][bottom][index][help] */
1057 {
1058     return pcmk__str_any_of(action,
1059                             PCMK_ACTION_START,
1060                             PCMK_ACTION_STOP,
1061                             PCMK_ACTION_MONITOR,
1062                             PCMK_ACTION_MIGRATE_TO,
1063                             PCMK_ACTION_MIGRATE_FROM,
1064                             PCMK_ACTION_RELOAD_AGENT,
1065                             PCMK_ACTION_RELOAD,
1066                             NULL);
1067 }
1068 
1069 static GList *
1070 fail_all_monitor_cmds(GList * list)
     /* [previous][next][first][last][top][bottom][index][help] */
1071 {
1072     GList *rm_list = NULL;
1073     remote_ra_cmd_t *cmd = NULL;
1074     GList *gIter = NULL;
1075 
1076     for (gIter = list; gIter != NULL; gIter = gIter->next) {
1077         cmd = gIter->data;
1078         if ((cmd->interval_ms > 0)
1079             && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
1080                             pcmk__str_casei)) {
1081             rm_list = g_list_append(rm_list, cmd);
1082         }
1083     }
1084 
1085     for (gIter = rm_list; gIter != NULL; gIter = gIter->next) {
1086         cmd = gIter->data;
1087 
1088         pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
1089                          PCMK_EXEC_ERROR, "Lost connection to remote executor");
1090         crm_trace("Pre-emptively failing %s %s (interval=%u, %s)",
1091                   cmd->action, cmd->rsc_id, cmd->interval_ms, cmd->userdata);
1092         report_remote_ra_result(cmd);
1093 
1094         list = g_list_remove(list, cmd);
1095         free_cmd(cmd);
1096     }
1097 
1098     /* frees only the list data, not the cmds */
1099     g_list_free(rm_list);
1100     return list;
1101 }
1102 
1103 static GList *
1104 remove_cmd(GList * list, const char *action, guint interval_ms)
     /* [previous][next][first][last][top][bottom][index][help] */
1105 {
1106     remote_ra_cmd_t *cmd = NULL;
1107     GList *gIter = NULL;
1108 
1109     for (gIter = list; gIter != NULL; gIter = gIter->next) {
1110         cmd = gIter->data;
1111         if ((cmd->interval_ms == interval_ms)
1112             && pcmk__str_eq(cmd->action, action, pcmk__str_casei)) {
1113             break;
1114         }
1115         cmd = NULL;
1116     }
1117     if (cmd) {
1118         list = g_list_remove(list, cmd);
1119         free_cmd(cmd);
1120     }
1121     return list;
1122 }
1123 
1124 int
1125 remote_ra_cancel(lrm_state_t *lrm_state, const char *rsc_id,
     /* [previous][next][first][last][top][bottom][index][help] */
1126                  const char *action, guint interval_ms)
1127 {
1128     lrm_state_t *connection_rsc = NULL;
1129     remote_ra_data_t *ra_data = NULL;
1130 
1131     connection_rsc = lrm_state_find(rsc_id);
1132     if (!connection_rsc || !connection_rsc->remote_ra_data) {
1133         return -EINVAL;
1134     }
1135 
1136     ra_data = connection_rsc->remote_ra_data;
1137     ra_data->cmds = remove_cmd(ra_data->cmds, action, interval_ms);
1138     ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action,
1139                                          interval_ms);
1140     if (ra_data->cur_cmd &&
1141         (ra_data->cur_cmd->interval_ms == interval_ms) &&
1142         (pcmk__str_eq(ra_data->cur_cmd->action, action, pcmk__str_casei))) {
1143 
1144         cmd_set_flags(ra_data->cur_cmd, cmd_cancel);
1145     }
1146 
1147     return 0;
1148 }
1149 
/*!
 * \internal
 * \brief Merge a new recurring monitor request into an existing duplicate
 *
 * \param[in,out] ra_data      Remote-RA data for the connection
 * \param[in]     interval_ms  Interval of the requested monitor
 * \param[in]     userdata     Userdata of the requested monitor (may be NULL)
 *
 * \return The existing duplicate command if one was found and merged,
 *         otherwise NULL (meaning the caller should create a new command)
 */
static remote_ra_cmd_t *
handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
                   const char *userdata)
{
    GList *gIter = NULL;
    remote_ra_cmd_t *cmd = NULL;

    /* there are 3 places a potential duplicate monitor operation
     * could exist.
     * 1. recurring_cmds list. where the op is waiting for its next interval
     * 2. cmds list, where the op is queued to get executed immediately
     * 3. cur_cmd, which means the monitor op is in flight right now.
     */
    if (interval_ms == 0) {
        // Probes (interval 0) are never merged
        return NULL;
    }

    // Case 3: in-flight monitor (unless it is being cancelled)
    if (ra_data->cur_cmd &&
        !pcmk_is_set(ra_data->cur_cmd->status, cmd_cancel) &&
        (ra_data->cur_cmd->interval_ms == interval_ms)
        && pcmk__str_eq(ra_data->cur_cmd->action, PCMK_ACTION_MONITOR,
                        pcmk__str_casei)) {

        cmd = ra_data->cur_cmd;
        goto handle_dup;
    }

    // Case 1: monitor waiting for its next interval
    for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms == interval_ms)
            && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                            pcmk__str_casei)) {
            goto handle_dup;
        }
    }

    // Case 2: monitor queued for immediate execution
    for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms == interval_ms)
            && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                            pcmk__str_casei)) {
            goto handle_dup;
        }
    }

    return NULL;

handle_dup:

    crm_trace("merging duplicate monitor cmd " PCMK__OP_FMT,
              cmd->rsc_id, PCMK_ACTION_MONITOR, interval_ms);

    /* update the userdata */
    if (userdata) {
       free(cmd->userdata);
       cmd->userdata = strdup(userdata);
    }

    /* if we've already reported success, generate a new call id */
    if (pcmk_is_set(cmd->status, cmd_reported_success)) {
        cmd->start_time = time(NULL);
        cmd->call_id = generate_callid();
        cmd_clear_flags(cmd, cmd_reported_success);
    }

    /* if we have an interval_id set, that means we are in the process of
     * waiting for this cmd's next interval. instead of waiting, cancel
     * the timer and execute the action immediately */
    if (cmd->interval_id) {
        g_source_remove(cmd->interval_id);
        cmd->interval_id = 0;
        recurring_helper(cmd);
    }

    return cmd;
}
1226 
1227 /*!
1228  * \internal
1229  * \brief Execute an action using the (internal) ocf:pacemaker:remote agent
1230  *
1231  * \param[in]     lrm_state      Executor state object for remote connection
1232  * \param[in]     rsc_id         Connection resource ID
1233  * \param[in]     action         Action to execute
1234  * \param[in]     userdata       String to copy and pass to execution callback
1235  * \param[in]     interval_ms    Action interval (in milliseconds)
1236  * \param[in]     timeout_ms     Action timeout (in milliseconds)
1237  * \param[in]     start_delay_ms Delay (in milliseconds) before executing action
1238  * \param[in,out] params         Connection resource parameters
1239  * \param[out]    call_id        Where to store call ID on success
1240  *
1241  * \return Standard Pacemaker return code
1242  * \note This takes ownership of \p params, which should not be used or freed
1243  *       after calling this function.
1244  */
1245 int
1246 controld_execute_remote_agent(const lrm_state_t *lrm_state, const char *rsc_id,
     /* [previous][next][first][last][top][bottom][index][help] */
1247                               const char *action, const char *userdata,
1248                               guint interval_ms, int timeout_ms,
1249                               int start_delay_ms, lrmd_key_value_t *params,
1250                               int *call_id)
1251 {
1252     lrm_state_t *connection_rsc = NULL;
1253     remote_ra_cmd_t *cmd = NULL;
1254     remote_ra_data_t *ra_data = NULL;
1255 
1256     *call_id = 0;
1257 
1258     CRM_CHECK((lrm_state != NULL) && (rsc_id != NULL) && (action != NULL)
1259               && (userdata != NULL) && (call_id != NULL),
1260               lrmd_key_value_freeall(params); return EINVAL);
1261 
1262     if (!is_remote_ra_supported_action(action)) {
1263         lrmd_key_value_freeall(params);
1264         return EOPNOTSUPP;
1265     }
1266 
1267     connection_rsc = lrm_state_find(rsc_id);
1268     if (connection_rsc == NULL) {
1269         lrmd_key_value_freeall(params);
1270         return ENOTCONN;
1271     }
1272 
1273     remote_ra_data_init(connection_rsc);
1274     ra_data = connection_rsc->remote_ra_data;
1275 
1276     cmd = handle_dup_monitor(ra_data, interval_ms, userdata);
1277     if (cmd) {
1278         *call_id = cmd->call_id;
1279         lrmd_key_value_freeall(params);
1280         return pcmk_rc_ok;
1281     }
1282 
1283     cmd = calloc(1, sizeof(remote_ra_cmd_t));
1284     if (cmd == NULL) {
1285         lrmd_key_value_freeall(params);
1286         return ENOMEM;
1287     }
1288 
1289     cmd->owner = strdup(lrm_state->node_name);
1290     cmd->rsc_id = strdup(rsc_id);
1291     cmd->action = strdup(action);
1292     cmd->userdata = strdup(userdata);
1293     if ((cmd->owner == NULL) || (cmd->rsc_id == NULL) || (cmd->action == NULL)
1294         || (cmd->userdata == NULL)) {
1295         free_cmd(cmd);
1296         lrmd_key_value_freeall(params);
1297         return ENOMEM;
1298     }
1299 
1300     cmd->interval_ms = interval_ms;
1301     cmd->timeout = timeout_ms;
1302     cmd->start_delay = start_delay_ms;
1303     cmd->params = params;
1304     cmd->start_time = time(NULL);
1305 
1306     cmd->call_id = generate_callid();
1307 
1308     if (cmd->start_delay) {
1309         cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd);
1310     }
1311 
1312     ra_data->cmds = g_list_append(ra_data->cmds, cmd);
1313     mainloop_set_trigger(ra_data->work);
1314 
1315     *call_id = cmd->call_id;
1316     return pcmk_rc_ok;
1317 }
1318 
1319 /*!
1320  * \internal
1321  * \brief Immediately fail all monitors of a remote node, if proxied here
1322  *
1323  * \param[in] node_name  Name of pacemaker_remote node
1324  */
1325 void
1326 remote_ra_fail(const char *node_name)
     /* [previous][next][first][last][top][bottom][index][help] */
1327 {
1328     lrm_state_t *lrm_state = lrm_state_find(node_name);
1329 
1330     if (lrm_state && lrm_state_is_connected(lrm_state)) {
1331         remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1332 
1333         crm_info("Failing monitors on Pacemaker Remote node %s", node_name);
1334         ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
1335         ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
1336     }
1337 }
1338 
1339 /* A guest node fencing implied by host fencing looks like:
1340  *
1341  *  <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off"
1342  *                on_node="lxc1" on_node_uuid="lxc1">
1343  *     <attributes CRM_meta_on_node="lxc1" CRM_meta_on_node_uuid="lxc1"
1344  *                 CRM_meta_stonith_action="off" crm_feature_set="3.0.12"/>
1345  *     <downed>
1346  *       <node id="lxc1"/>
1347  *     </downed>
1348  *  </pseudo_event>
1349  */
1350 #define XPATH_PSEUDO_FENCE "/" XML_GRAPH_TAG_PSEUDO_EVENT \
1351     "[@" XML_LRM_ATTR_TASK "='stonith']/" XML_GRAPH_TAG_DOWNED \
1352     "/" XML_CIB_TAG_NODE
1353 
1354 /*!
1355  * \internal
1356  * \brief Check a pseudo-action for Pacemaker Remote node side effects
1357  *
1358  * \param[in,out] xml  XML of pseudo-action to check
1359  */
1360 void
1361 remote_ra_process_pseudo(xmlNode *xml)
     /* [previous][next][first][last][top][bottom][index][help] */
1362 {
1363     xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE);
1364 
1365     if (numXpathResults(search) == 1) {
1366         xmlNode *result = getXpathResult(search, 0);
1367 
1368         /* Normally, we handle the necessary side effects of a guest node stop
1369          * action when reporting the remote agent's result. However, if the stop
1370          * is implied due to fencing, it will be a fencing pseudo-event, and
1371          * there won't be a result to report. Handle that case here.
1372          *
1373          * This will result in a duplicate call to remote_node_down() if the
1374          * guest stop was real instead of implied, but that shouldn't hurt.
1375          *
1376          * There is still one corner case that isn't handled: if a guest node
1377          * isn't running any resources when its host is fenced, it will appear
1378          * to be cleanly stopped, so there will be no pseudo-fence, and our
1379          * peer cache state will be incorrect unless and until the guest is
1380          * recovered.
1381          */
1382         if (result) {
1383             const char *remote = ID(result);
1384 
1385             if (remote) {
1386                 remote_node_down(remote, DOWN_ERASE_LRM);
1387             }
1388         }
1389     }
1390     freeXpathObject(search);
1391 }
1392 
1393 static void
1394 remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
     /* [previous][next][first][last][top][bottom][index][help] */
1395 {
1396     xmlNode *update, *state;
1397     int call_opt;
1398     crm_node_t *node;
1399 
1400     call_opt = crmd_cib_smart_opt();
1401     node = crm_remote_peer_get(lrm_state->node_name);
1402     CRM_CHECK(node != NULL, return);
1403     update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
1404     state = create_node_state_update(node, node_update_none, update,
1405                                      __func__);
1406     crm_xml_add(state, XML_NODE_IS_MAINTENANCE, maintenance?"1":"0");
1407     if (controld_update_cib(XML_CIB_TAG_STATUS, update, call_opt,
1408                             NULL) == pcmk_rc_ok) {
1409         /* TODO: still not 100% sure that async update will succeed ... */
1410         if (maintenance) {
1411             lrm_remote_set_flags(lrm_state, remote_in_maint);
1412         } else {
1413             lrm_remote_clear_flags(lrm_state, remote_in_maint);
1414         }
1415     }
1416     free_xml(update);
1417 }
1418 
#define XPATH_PSEUDO_MAINTENANCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
    "[@" XML_LRM_ATTR_TASK "='" PCMK_ACTION_MAINTENANCE_NODES "']/" \
    XML_GRAPH_TAG_MAINTENANCE

/*!
 * \internal
 * \brief Check a pseudo-action holding updates for maintenance state
 *
 * \param[in,out] xml  XML of pseudo-action to check
 */
void
remote_ra_process_maintenance_nodes(xmlNode *xml)
{
    xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_MAINTENANCE);

    if (numXpathResults(search) == 1) {
        xmlNode *node;
        int cnt = 0, cnt_remote = 0;

        // Walk the <node> children of the maintenance element
        for (node = first_named_child(getXpathResult(search, 0),
                                      XML_CIB_TAG_NODE);
             node != NULL; node = crm_next_same_xml(node)) {

            lrm_state_t *lrm_state = lrm_state_find(ID(node));

            cnt++;
            // Only act on remote nodes whose connection is active here
            if (lrm_state && lrm_state->remote_ra_data &&
                pcmk_is_set(((remote_ra_data_t *) lrm_state->remote_ra_data)->status, remote_active)) {
                int is_maint;

                cnt_remote++;
                // Missing/invalid attribute defaults to 0 (not in maintenance)
                pcmk__scan_min_int(crm_element_value(node, XML_NODE_IS_MAINTENANCE),
                                   &is_maint, 0);
                remote_ra_maintenance(lrm_state, is_maint);
            }
        }
        crm_trace("Action holds %d nodes (%d remotes found) "
                    "adjusting maintenance-mode", cnt, cnt_remote);
    }
    freeXpathObject(search);
}
1460 
1461 gboolean
1462 remote_ra_is_in_maintenance(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1463 {
1464     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1465     return pcmk_is_set(ra_data->status, remote_in_maint);
1466 }
1467 
1468 gboolean
1469 remote_ra_controlling_guest(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1470 {
1471     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1472     return pcmk_is_set(ra_data->status, controlling_guest);
1473 }

/* [previous][next][first][last][top][bottom][index][help] */