root/lib/pengine/unpack.c


DEFINITIONS

This source file includes the following definitions.
  1. is_dangling_guest_node
  2. pe_fence_node
  3. set_if_xpath
  4. unpack_config
  5. pe_create_node
  6. expand_remote_rsc_meta
  7. handle_startup_fencing
  8. unpack_nodes
  9. setup_container
  10. unpack_remote_nodes
  11. link_rsc2remotenode
  12. destroy_tag
  13. unpack_resources
  14. unpack_tags
  15. unpack_ticket_state
  16. unpack_tickets_state
  17. unpack_handle_remote_attrs
  18. unpack_transient_attributes
  19. unpack_node_state
  20. unpack_node_history
  21. unpack_status
  22. unpack_node_member
  23. unpack_node_online
  24. unpack_node_terminate
  25. determine_online_status_no_fencing
  26. pending_too_long
  27. determine_online_status_fencing
  28. determine_remote_online_status
  29. determine_online_status
  30. pe_base_name_end
  31. clone_strip
  32. clone_zero
  33. create_fake_resource
  34. create_anonymous_orphan
  35. find_anonymous_clone
  36. unpack_find_resource
  37. process_orphan_resource
  38. process_rsc_state
  39. process_recurring
  40. calculate_active_ops
  41. unpack_shutdown_lock
  42. unpack_lrm_resource
  43. handle_orphaned_container_fillers
  44. unpack_node_lrm
  45. set_active
  46. set_node_score
  47. find_lrm_op
  48. find_lrm_resource
  49. unknown_on_node
  50. monitor_not_running_after
  51. non_monitor_after
  52. newer_state_after_migrate
  53. get_migration_node_names
  54. add_dangling_migration
  55. unpack_migrate_to_success
  56. unpack_migrate_to_failure
  57. unpack_migrate_from_failure
  58. record_failed_op
  59. last_change_str
  60. cmp_on_fail
  61. ban_from_all_nodes
  62. unpack_failure_handling
  63. unpack_rsc_op_failure
  64. block_if_unrecoverable
  65. remap_because
  66. remap_operation
  67. should_clear_for_param_change
  68. order_after_remote_fencing
  69. should_ignore_failure_timeout
  70. check_operation_expiry
  71. pe__target_rc_from_xml
  72. update_resource_state
  73. can_affect_state
  74. unpack_action_result
  75. process_expired_result
  76. mask_probe_failure
  77. failure_is_newer
  78. process_pending_action
  79. unpack_rsc_op
  80. add_node_attrs
  81. extract_operations
  82. find_operations

   1 /*
   2  * Copyright 2004-2023 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU Lesser General Public License
   7  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <stdio.h>
  13 #include <string.h>
  14 #include <glib.h>
  15 #include <time.h>
  16 
  17 #include <crm/crm.h>
  18 #include <crm/services.h>
  19 #include <crm/msg_xml.h>
  20 #include <crm/common/xml.h>
  21 #include <crm/common/xml_internal.h>
  22 
  23 #include <crm/common/util.h>
  24 #include <crm/pengine/rules.h>
  25 #include <crm/pengine/internal.h>
  26 #include <pe_status_private.h>
  27 
  28 CRM_TRACE_INIT_DATA(pe_status);
  29 
  30 // A (parsed) resource action history entry
  31 struct action_history {
  32     pcmk_resource_t *rsc;       // Resource that history is for
  33     pcmk_node_t *node;        // Node that history is for
  34     xmlNode *xml;             // History entry XML
  35 
  36     // Parsed from entry XML
  37     const char *id;           // XML ID of history entry
  38     const char *key;          // Operation key of action
  39     const char *task;         // Action name
  40     const char *exit_reason;  // Exit reason given for result
  41     guint interval_ms;        // Action interval
  42     int call_id;              // Call ID of action
  43     int expected_exit_status; // Expected exit status of action
  44     int exit_status;          // Actual exit status of action
  45     int execution_status;     // Execution status of action
  46 };
  47 
  48 /* This uses pcmk__set_flags_as()/pcmk__clear_flags_as() directly rather than
   49  * using pe__set_working_set_flags()/pe__clear_working_set_flags() so that the
  50  * flag is stringified more readably in log messages.
  51  */
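      /* For example, unpack_config() below uses this macro to map Boolean
       * cluster options to scheduler flags, leaving a flag untouched when the
       * option has no value:
       *
       *     set_config_flag(scheduler, "stonith-enabled",
       *                     pcmk_sched_fencing_enabled);
       */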
  52 #define set_config_flag(scheduler, option, flag) do {                         \
  53         const char *scf_value = pe_pref((scheduler)->config_hash, (option));  \
  54         if (scf_value != NULL) {                                              \
  55             if (crm_is_true(scf_value)) {                                     \
  56                 (scheduler)->flags = pcmk__set_flags_as(__func__, __LINE__,   \
  57                                     LOG_TRACE, "Scheduler",                   \
  58                                     crm_system_name, (scheduler)->flags,      \
  59                                     (flag), #flag);                           \
  60             } else {                                                          \
  61                 (scheduler)->flags = pcmk__clear_flags_as(__func__, __LINE__, \
  62                                     LOG_TRACE, "Scheduler",                   \
  63                                     crm_system_name, (scheduler)->flags,      \
  64                                     (flag), #flag);                           \
  65             }                                                                 \
  66         }                                                                     \
  67     } while(0)
  68 
  69 static void unpack_rsc_op(pcmk_resource_t *rsc, pcmk_node_t *node,
  70                           xmlNode *xml_op, xmlNode **last_failure,
  71                           enum action_fail_response *failed);
  72 static void determine_remote_online_status(pcmk_scheduler_t *scheduler,
  73                                            pcmk_node_t *this_node);
  74 static void add_node_attrs(const xmlNode *xml_obj, pcmk_node_t *node,
  75                            bool overwrite, pcmk_scheduler_t *scheduler);
  76 static void determine_online_status(const xmlNode *node_state,
  77                                     pcmk_node_t *this_node,
  78                                     pcmk_scheduler_t *scheduler);
  79 
  80 static void unpack_node_lrm(pcmk_node_t *node, const xmlNode *xml,
  81                             pcmk_scheduler_t *scheduler);
  82 
  83 
  84 static gboolean
  85 is_dangling_guest_node(pcmk_node_t *node)
  86 {
   87     /* We are looking for a remote node that was supposed to be mapped to a
   88      * container resource, but all traces of that container have disappeared
   89      * from both the config and the status section. */
  90     if (pe__is_guest_or_remote_node(node) &&
  91         node->details->remote_rsc &&
  92         node->details->remote_rsc->container == NULL &&
  93         pcmk_is_set(node->details->remote_rsc->flags,
  94                     pcmk_rsc_removed_filler)) {
  95         return TRUE;
  96     }
  97 
  98     return FALSE;
  99 }
 100 
 101 /*!
 102  * \brief Schedule a fence action for a node
 103  *
 104  * \param[in,out] scheduler       Scheduler data
 105  * \param[in,out] node            Node to fence
 106  * \param[in]     reason          Text description of why fencing is needed
 107  * \param[in]     priority_delay  Whether to consider `priority-fencing-delay`
 108  */
 109 void
 110 pe_fence_node(pcmk_scheduler_t *scheduler, pcmk_node_t *node,
 111               const char *reason, bool priority_delay)
 112 {
 113     CRM_CHECK(node, return);
 114 
 115     /* A guest node is fenced by marking its container as failed */
 116     if (pe__is_guest_node(node)) {
 117         pcmk_resource_t *rsc = node->details->remote_rsc->container;
 118 
 119         if (!pcmk_is_set(rsc->flags, pcmk_rsc_failed)) {
 120             if (!pcmk_is_set(rsc->flags, pcmk_rsc_managed)) {
 121                 crm_notice("Not fencing guest node %s "
 122                            "(otherwise would because %s): "
 123                            "its guest resource %s is unmanaged",
 124                            pe__node_name(node), reason, rsc->id);
 125             } else {
 126                 crm_warn("Guest node %s will be fenced "
 127                          "(by recovering its guest resource %s): %s",
 128                          pe__node_name(node), rsc->id, reason);
 129 
 130                 /* We don't mark the node as unclean because that would prevent the
 131                  * node from running resources. We want to allow it to run resources
 132                  * in this transition if the recovery succeeds.
 133                  */
 134                 node->details->remote_requires_reset = TRUE;
 135                 pe__set_resource_flags(rsc,
 136                                        pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
 137             }
 138         }
 139 
 140     } else if (is_dangling_guest_node(node)) {
 141         crm_info("Cleaning up dangling connection for guest node %s: "
 142                  "fencing was already done because %s, "
 143                  "and guest resource no longer exists",
 144                  pe__node_name(node), reason);
 145         pe__set_resource_flags(node->details->remote_rsc,
 146                                pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
 147 
 148     } else if (pe__is_remote_node(node)) {
 149         pcmk_resource_t *rsc = node->details->remote_rsc;
 150 
 151         if ((rsc != NULL) && !pcmk_is_set(rsc->flags, pcmk_rsc_managed)) {
 152             crm_notice("Not fencing remote node %s "
 153                        "(otherwise would because %s): connection is unmanaged",
 154                        pe__node_name(node), reason);
  155         } else if (node->details->remote_requires_reset == FALSE) {
 156             node->details->remote_requires_reset = TRUE;
 157             crm_warn("Remote node %s %s: %s",
 158                      pe__node_name(node),
 159                      pe_can_fence(scheduler, node)? "will be fenced" : "is unclean",
 160                      reason);
 161         }
 162         node->details->unclean = TRUE;
 163         // No need to apply `priority-fencing-delay` for remote nodes
 164         pe_fence_op(node, NULL, TRUE, reason, FALSE, scheduler);
 165 
 166     } else if (node->details->unclean) {
 167         crm_trace("Cluster node %s %s because %s",
 168                   pe__node_name(node),
 169                   pe_can_fence(scheduler, node)? "would also be fenced" : "also is unclean",
 170                   reason);
 171 
 172     } else {
 173         crm_warn("Cluster node %s %s: %s",
 174                  pe__node_name(node),
 175                  pe_can_fence(scheduler, node)? "will be fenced" : "is unclean",
 176                  reason);
 177         node->details->unclean = TRUE;
 178         pe_fence_op(node, NULL, TRUE, reason, priority_delay, scheduler);
 179     }
 180 }
 181 
 182 // @TODO xpaths can't handle templates, rules, or id-refs
 183 
 184 // nvpair with provides or requires set to unfencing
 185 #define XPATH_UNFENCING_NVPAIR XML_CIB_TAG_NVPAIR                \
 186     "[(@" XML_NVPAIR_ATTR_NAME "='" PCMK_STONITH_PROVIDES "'"    \
 187     "or @" XML_NVPAIR_ATTR_NAME "='" XML_RSC_ATTR_REQUIRES "') " \
 188     "and @" XML_NVPAIR_ATTR_VALUE "='" PCMK__VALUE_UNFENCING "']"
 189 
 190 // unfencing in rsc_defaults or any resource
 191 #define XPATH_ENABLE_UNFENCING \
 192     "/" XML_TAG_CIB "/" XML_CIB_TAG_CONFIGURATION "/" XML_CIB_TAG_RESOURCES   \
  193     "//" XML_TAG_META_SETS "/" XPATH_UNFENCING_NVPAIR                         \
 194     "|/" XML_TAG_CIB "/" XML_CIB_TAG_CONFIGURATION "/" XML_CIB_TAG_RSCCONFIG  \
 195     "/" XML_TAG_META_SETS "/" XPATH_UNFENCING_NVPAIR
 196 
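      /*!
       * \internal
       * \brief Set a scheduler flag if an XPath expression matches the input CIB
       *
       * \param[in]     flag       Scheduler flag to set if \p xpath matches
       * \param[in]     xpath      XPath expression to evaluate against the CIB
       * \param[in,out] scheduler  Scheduler data
       */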
 197 static void
 198 set_if_xpath(uint64_t flag, const char *xpath, pcmk_scheduler_t *scheduler)
 199 {
 200     xmlXPathObjectPtr result = NULL;
 201 
 202     if (!pcmk_is_set(scheduler->flags, flag)) {
 203         result = xpath_search(scheduler->input, xpath);
 204         if (result && (numXpathResults(result) > 0)) {
 205             pe__set_working_set_flags(scheduler, flag);
 206         }
 207         freeXpathObject(result);
 208     }
 209 }
 210 
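      /*!
       * \internal
       * \brief Unpack cluster options (crm_config) into scheduler data
       *
       * \param[in,out] config     XML of cluster properties to unpack
       * \param[in,out] scheduler  Scheduler data
       *
       * \return TRUE (this function does not currently fail)
       */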
 211 gboolean
 212 unpack_config(xmlNode *config, pcmk_scheduler_t *scheduler)
 213 {
 214     const char *value = NULL;
 215     GHashTable *config_hash = pcmk__strkey_table(free, free);
 216 
 217     pe_rule_eval_data_t rule_data = {
 218         .node_hash = NULL,
 219         .role = pcmk_role_unknown,
 220         .now = scheduler->now,
 221         .match_data = NULL,
 222         .rsc_data = NULL,
 223         .op_data = NULL
 224     };
 225 
 226     scheduler->config_hash = config_hash;
 227 
 228     pe__unpack_dataset_nvpairs(config, XML_CIB_TAG_PROPSET, &rule_data, config_hash,
 229                                CIB_OPTIONS_FIRST, FALSE, scheduler);
 230 
 231     verify_pe_options(scheduler->config_hash);
 232 
 233     set_config_flag(scheduler, "enable-startup-probes",
 234                     pcmk_sched_probe_resources);
 235     if (!pcmk_is_set(scheduler->flags, pcmk_sched_probe_resources)) {
 236         crm_info("Startup probes: disabled (dangerous)");
 237     }
 238 
 239     value = pe_pref(scheduler->config_hash, XML_ATTR_HAVE_WATCHDOG);
 240     if (value && crm_is_true(value)) {
 241         crm_info("Watchdog-based self-fencing will be performed via SBD if "
 242                  "fencing is required and stonith-watchdog-timeout is nonzero");
 243         pe__set_working_set_flags(scheduler, pcmk_sched_have_fencing);
 244     }
 245 
 246     /* Set certain flags via xpath here, so they can be used before the relevant
 247      * configuration sections are unpacked.
 248      */
 249     set_if_xpath(pcmk_sched_enable_unfencing, XPATH_ENABLE_UNFENCING,
 250                  scheduler);
 251 
 252     value = pe_pref(scheduler->config_hash, "stonith-timeout");
 253     scheduler->stonith_timeout = (int) crm_parse_interval_spec(value);
 254     crm_debug("STONITH timeout: %d", scheduler->stonith_timeout);
 255 
 256     set_config_flag(scheduler, "stonith-enabled", pcmk_sched_fencing_enabled);
 257     if (pcmk_is_set(scheduler->flags, pcmk_sched_fencing_enabled)) {
 258         crm_debug("STONITH of failed nodes is enabled");
 259     } else {
 260         crm_debug("STONITH of failed nodes is disabled");
 261     }
 262 
 263     scheduler->stonith_action = pe_pref(scheduler->config_hash,
 264                                         "stonith-action");
 265     if (!strcmp(scheduler->stonith_action, "poweroff")) {
 266         pe_warn_once(pcmk__wo_poweroff,
 267                      "Support for stonith-action of 'poweroff' is deprecated "
 268                      "and will be removed in a future release (use 'off' instead)");
 269         scheduler->stonith_action = PCMK_ACTION_OFF;
 270     }
 271     crm_trace("STONITH will %s nodes", scheduler->stonith_action);
 272 
 273     set_config_flag(scheduler, "concurrent-fencing",
 274                     pcmk_sched_concurrent_fencing);
 275     if (pcmk_is_set(scheduler->flags, pcmk_sched_concurrent_fencing)) {
 276         crm_debug("Concurrent fencing is enabled");
 277     } else {
 278         crm_debug("Concurrent fencing is disabled");
 279     }
 280 
 281     value = pe_pref(scheduler->config_hash,
 282                     XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY);
 283     if (value) {
 284         scheduler->priority_fencing_delay = crm_parse_interval_spec(value)
 285                                             / 1000;
 286         crm_trace("Priority fencing delay is %ds",
 287                   scheduler->priority_fencing_delay);
 288     }
 289 
 290     set_config_flag(scheduler, "stop-all-resources", pcmk_sched_stop_all);
 291     crm_debug("Stop all active resources: %s",
 292               pcmk__btoa(pcmk_is_set(scheduler->flags, pcmk_sched_stop_all)));
 293 
 294     set_config_flag(scheduler, "symmetric-cluster",
 295                     pcmk_sched_symmetric_cluster);
 296     if (pcmk_is_set(scheduler->flags, pcmk_sched_symmetric_cluster)) {
  297         crm_debug("Cluster is symmetric - resources can run anywhere by default");
 298     }
 299 
 300     value = pe_pref(scheduler->config_hash, "no-quorum-policy");
 301 
 302     if (pcmk__str_eq(value, "ignore", pcmk__str_casei)) {
 303         scheduler->no_quorum_policy = pcmk_no_quorum_ignore;
 304 
 305     } else if (pcmk__str_eq(value, "freeze", pcmk__str_casei)) {
 306         scheduler->no_quorum_policy = pcmk_no_quorum_freeze;
 307 
 308     } else if (pcmk__str_eq(value, "demote", pcmk__str_casei)) {
 309         scheduler->no_quorum_policy = pcmk_no_quorum_demote;
 310 
 311     } else if (pcmk__str_eq(value, "suicide", pcmk__str_casei)) {
 312         if (pcmk_is_set(scheduler->flags, pcmk_sched_fencing_enabled)) {
 313             int do_panic = 0;
 314 
 315             crm_element_value_int(scheduler->input, XML_ATTR_QUORUM_PANIC,
 316                                   &do_panic);
 317             if (do_panic || pcmk_is_set(scheduler->flags, pcmk_sched_quorate)) {
 318                 scheduler->no_quorum_policy = pcmk_no_quorum_fence;
 319             } else {
 320                 crm_notice("Resetting no-quorum-policy to 'stop': cluster has never had quorum");
 321                 scheduler->no_quorum_policy = pcmk_no_quorum_stop;
 322             }
 323         } else {
 324             pcmk__config_err("Resetting no-quorum-policy to 'stop' because "
 325                              "fencing is disabled");
 326             scheduler->no_quorum_policy = pcmk_no_quorum_stop;
 327         }
 328 
 329     } else {
 330         scheduler->no_quorum_policy = pcmk_no_quorum_stop;
 331     }
 332 
 333     switch (scheduler->no_quorum_policy) {
 334         case pcmk_no_quorum_freeze:
 335             crm_debug("On loss of quorum: Freeze resources");
 336             break;
 337         case pcmk_no_quorum_stop:
 338             crm_debug("On loss of quorum: Stop ALL resources");
 339             break;
 340         case pcmk_no_quorum_demote:
 341             crm_debug("On loss of quorum: "
 342                       "Demote promotable resources and stop other resources");
 343             break;
 344         case pcmk_no_quorum_fence:
 345             crm_notice("On loss of quorum: Fence all remaining nodes");
 346             break;
 347         case pcmk_no_quorum_ignore:
 348             crm_notice("On loss of quorum: Ignore");
 349             break;
 350     }
 351 
 352     set_config_flag(scheduler, "stop-orphan-resources",
 353                     pcmk_sched_stop_removed_resources);
 354     if (pcmk_is_set(scheduler->flags, pcmk_sched_stop_removed_resources)) {
 355         crm_trace("Orphan resources are stopped");
 356     } else {
 357         crm_trace("Orphan resources are ignored");
 358     }
 359 
 360     set_config_flag(scheduler, "stop-orphan-actions",
 361                     pcmk_sched_cancel_removed_actions);
 362     if (pcmk_is_set(scheduler->flags, pcmk_sched_cancel_removed_actions)) {
 363         crm_trace("Orphan resource actions are stopped");
 364     } else {
 365         crm_trace("Orphan resource actions are ignored");
 366     }
 367 
 368     value = pe_pref(scheduler->config_hash, "remove-after-stop");
 369     if (value != NULL) {
 370         if (crm_is_true(value)) {
 371             pe__set_working_set_flags(scheduler, pcmk_sched_remove_after_stop);
 372 #ifndef PCMK__COMPAT_2_0
 373             pe_warn_once(pcmk__wo_remove_after,
 374                          "Support for the remove-after-stop cluster property is"
 375                          " deprecated and will be removed in a future release");
 376 #endif
 377         } else {
 378             pe__clear_working_set_flags(scheduler,
 379                                         pcmk_sched_remove_after_stop);
 380         }
 381     }
 382 
 383     set_config_flag(scheduler, "maintenance-mode", pcmk_sched_in_maintenance);
 384     crm_trace("Maintenance mode: %s",
 385               pcmk__btoa(pcmk_is_set(scheduler->flags,
 386                                      pcmk_sched_in_maintenance)));
 387 
 388     set_config_flag(scheduler, "start-failure-is-fatal",
 389                     pcmk_sched_start_failure_fatal);
 390     if (pcmk_is_set(scheduler->flags, pcmk_sched_start_failure_fatal)) {
 391         crm_trace("Start failures are always fatal");
 392     } else {
 393         crm_trace("Start failures are handled by failcount");
 394     }
 395 
 396     if (pcmk_is_set(scheduler->flags, pcmk_sched_fencing_enabled)) {
 397         set_config_flag(scheduler, "startup-fencing",
 398                         pcmk_sched_startup_fencing);
 399     }
 400     if (pcmk_is_set(scheduler->flags, pcmk_sched_startup_fencing)) {
 401         crm_trace("Unseen nodes will be fenced");
 402     } else {
 403         pe_warn_once(pcmk__wo_blind, "Blind faith: not fencing unseen nodes");
 404     }
 405 
 406     pe__unpack_node_health_scores(scheduler);
 407 
 408     scheduler->placement_strategy = pe_pref(scheduler->config_hash,
 409                                             "placement-strategy");
 410     crm_trace("Placement strategy: %s", scheduler->placement_strategy);
 411 
 412     set_config_flag(scheduler, "shutdown-lock", pcmk_sched_shutdown_lock);
 413     if (pcmk_is_set(scheduler->flags, pcmk_sched_shutdown_lock)) {
 414         value = pe_pref(scheduler->config_hash,
 415                         XML_CONFIG_ATTR_SHUTDOWN_LOCK_LIMIT);
 416         scheduler->shutdown_lock = crm_parse_interval_spec(value) / 1000;
 417         crm_trace("Resources will be locked to nodes that were cleanly "
 418                   "shut down (locks expire after %s)",
 419                   pcmk__readable_interval(scheduler->shutdown_lock));
 420     } else {
 421         crm_trace("Resources will not be locked to nodes that were cleanly "
 422                   "shut down");
 423     }
 424 
 425     value = pe_pref(scheduler->config_hash,
 426                     XML_CONFIG_ATTR_NODE_PENDING_TIMEOUT);
 427     scheduler->node_pending_timeout = crm_parse_interval_spec(value) / 1000;
 428     if (scheduler->node_pending_timeout == 0) {
 429         crm_trace("Do not fence pending nodes");
 430     } else {
 431         crm_trace("Fence pending nodes after %s",
 432                   pcmk__readable_interval(scheduler->node_pending_timeout
 433                                           * 1000));
 434     }
 435 
 436     return TRUE;
 437 }
 438 
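      /*!
       * \brief Create a node object and add it to the scheduler's node list
       *
       * \param[in]     id         Node ID from the CIB
       * \param[in]     uname      Node name
       * \param[in]     type       Node type ("member", "remote", or "ping")
       * \param[in]     score      Initial node score (as a string)
       * \param[in,out] scheduler  Scheduler data
       *
       * \return Newly allocated node object, or NULL if allocation failed
       */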
 439 pcmk_node_t *
 440 pe_create_node(const char *id, const char *uname, const char *type,
 441                const char *score, pcmk_scheduler_t *scheduler)
 442 {
 443     pcmk_node_t *new_node = NULL;
 444 
 445     if (pe_find_node(scheduler->nodes, uname) != NULL) {
 446         pcmk__config_warn("More than one node entry has name '%s'", uname);
 447     }
 448 
 449     new_node = calloc(1, sizeof(pcmk_node_t));
 450     if (new_node == NULL) {
 451         return NULL;
 452     }
 453 
 454     new_node->weight = char2score(score);
 455     new_node->details = calloc(1, sizeof(struct pe_node_shared_s));
 456 
 457     if (new_node->details == NULL) {
 458         free(new_node);
 459         return NULL;
 460     }
 461 
 462     crm_trace("Creating node for entry %s/%s", uname, id);
 463     new_node->details->id = id;
 464     new_node->details->uname = uname;
 465     new_node->details->online = FALSE;
 466     new_node->details->shutdown = FALSE;
 467     new_node->details->rsc_discovery_enabled = TRUE;
 468     new_node->details->running_rsc = NULL;
 469     new_node->details->data_set = scheduler;
 470 
 471     if (pcmk__str_eq(type, "member", pcmk__str_null_matches | pcmk__str_casei)) {
 472         new_node->details->type = pcmk_node_variant_cluster;
 473 
 474     } else if (pcmk__str_eq(type, "remote", pcmk__str_casei)) {
 475         new_node->details->type = pcmk_node_variant_remote;
 476         pe__set_working_set_flags(scheduler, pcmk_sched_have_remote_nodes);
 477 
 478     } else {
 479         /* @COMPAT 'ping' is the default for backward compatibility, but it
 480          * should be changed to 'member' at a compatibility break
 481          */
 482         if (!pcmk__str_eq(type, "ping", pcmk__str_casei)) {
 483             pcmk__config_warn("Node %s has unrecognized type '%s', "
 484                               "assuming 'ping'", pcmk__s(uname, "without name"),
 485                               type);
 486         }
 487         pe_warn_once(pcmk__wo_ping_node,
 488                      "Support for nodes of type 'ping' (such as %s) is "
 489                      "deprecated and will be removed in a future release",
 490                      pcmk__s(uname, "unnamed node"));
 491         new_node->details->type = node_ping;
 492     }
 493 
 494     new_node->details->attrs = pcmk__strkey_table(free, free);
 495 
 496     if (pe__is_guest_or_remote_node(new_node)) {
 497         g_hash_table_insert(new_node->details->attrs, strdup(CRM_ATTR_KIND),
 498                             strdup("remote"));
 499     } else {
 500         g_hash_table_insert(new_node->details->attrs, strdup(CRM_ATTR_KIND),
 501                             strdup("cluster"));
 502     }
 503 
 504     new_node->details->utilization = pcmk__strkey_table(free, free);
 505     new_node->details->digest_cache = pcmk__strkey_table(free,
 506                                                           pe__free_digests);
 507 
 508     scheduler->nodes = g_list_insert_sorted(scheduler->nodes, new_node,
 509                                             pe__cmp_node_name);
 510     return new_node;
 511 }
 512 
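      /*!
       * \internal
       * \brief Expand a guest node definition into a remote connection resource
       *
       * If a resource's meta-attributes define a guest node (via the
       * remote-node meta-attribute), add an implicit ocf:pacemaker:remote
       * resource for the guest node's connection to the configuration.
       *
       * \param[in,out] xml_obj  XML of resource to check for a guest node definition
       * \param[in,out] parent   Resources section XML to add any connection resource to
       * \param[in,out] data     Scheduler data
       *
       * \return Name of guest node if a connection resource was added, else NULL
       */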
 513 static const char *
 514 expand_remote_rsc_meta(xmlNode *xml_obj, xmlNode *parent, pcmk_scheduler_t *data)
 515 {
 516     xmlNode *attr_set = NULL;
 517     xmlNode *attr = NULL;
 518 
 519     const char *container_id = ID(xml_obj);
 520     const char *remote_name = NULL;
 521     const char *remote_server = NULL;
 522     const char *remote_port = NULL;
 523     const char *connect_timeout = "60s";
  524     const char *remote_allow_migrate = NULL;
 525     const char *is_managed = NULL;
 526 
 527     for (attr_set = pcmk__xe_first_child(xml_obj); attr_set != NULL;
 528          attr_set = pcmk__xe_next(attr_set)) {
 529 
 530         if (!pcmk__str_eq((const char *)attr_set->name, XML_TAG_META_SETS,
 531                           pcmk__str_casei)) {
 532             continue;
 533         }
 534 
 535         for (attr = pcmk__xe_first_child(attr_set); attr != NULL;
 536              attr = pcmk__xe_next(attr)) {
 537             const char *value = crm_element_value(attr, XML_NVPAIR_ATTR_VALUE);
 538             const char *name = crm_element_value(attr, XML_NVPAIR_ATTR_NAME);
 539 
 540             if (pcmk__str_eq(name, XML_RSC_ATTR_REMOTE_NODE, pcmk__str_casei)) {
 541                 remote_name = value;
 542             } else if (pcmk__str_eq(name, "remote-addr", pcmk__str_casei)) {
 543                 remote_server = value;
 544             } else if (pcmk__str_eq(name, "remote-port", pcmk__str_casei)) {
 545                 remote_port = value;
 546             } else if (pcmk__str_eq(name, "remote-connect-timeout", pcmk__str_casei)) {
 547                 connect_timeout = value;
 548             } else if (pcmk__str_eq(name, "remote-allow-migrate", pcmk__str_casei)) {
  549                 remote_allow_migrate = value;
 550             } else if (pcmk__str_eq(name, XML_RSC_ATTR_MANAGED, pcmk__str_casei)) {
 551                 is_managed = value;
 552             }
 553         }
 554     }
 555 
 556     if (remote_name == NULL) {
 557         return NULL;
 558     }
 559 
 560     if (pe_find_resource(data->resources, remote_name) != NULL) {
 561         return NULL;
 562     }
 563 
 564     pe_create_remote_xml(parent, remote_name, container_id,
 565                          remote_allow_migrate, is_managed,
 566                          connect_timeout, remote_server, remote_port);
 567     return remote_name;
 568 }
 569 
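      /*!
       * \internal
       * \brief Apply the startup-fencing policy to a newly created node
       *
       * Mark the node as unseen, and as unclean or clean depending on whether
       * startup-fencing is enabled. Remote node entries without a connection
       * resource are left untouched.
       *
       * \param[in,out] scheduler  Scheduler data
       * \param[in,out] new_node   Node to update
       */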
 570 static void
 571 handle_startup_fencing(pcmk_scheduler_t *scheduler, pcmk_node_t *new_node)
 572 {
 573     if ((new_node->details->type == pcmk_node_variant_remote)
 574         && (new_node->details->remote_rsc == NULL)) {
 575         /* Ignore fencing for remote nodes that don't have a connection resource
 576          * associated with them. This happens when remote node entries get left
 577          * in the nodes section after the connection resource is removed.
 578          */
 579         return;
 580     }
 581 
 582     if (pcmk_is_set(scheduler->flags, pcmk_sched_startup_fencing)) {
 583         // All nodes are unclean until we've seen their status entry
 584         new_node->details->unclean = TRUE;
 585 
 586     } else {
 587         // Blind faith ...
 588         new_node->details->unclean = FALSE;
 589     }
 590 
 591     /* We need to be able to determine if a node's status section
  592      * exists or not, separately from whether the node is unclean. */
 593     new_node->details->unseen = TRUE;
 594 }
 595 
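      /*!
       * \internal
       * \brief Create node objects from the CIB's nodes section
       *
       * \param[in,out] xml_nodes  XML of the configuration's nodes section
       * \param[in,out] scheduler  Scheduler data
       *
       * \return TRUE on success, otherwise FALSE
       */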
 596 gboolean
 597 unpack_nodes(xmlNode *xml_nodes, pcmk_scheduler_t *scheduler)
 598 {
 599     xmlNode *xml_obj = NULL;
 600     pcmk_node_t *new_node = NULL;
 601     const char *id = NULL;
 602     const char *uname = NULL;
 603     const char *type = NULL;
 604     const char *score = NULL;
 605 
 606     for (xml_obj = pcmk__xe_first_child(xml_nodes); xml_obj != NULL;
 607          xml_obj = pcmk__xe_next(xml_obj)) {
 608 
 609         if (pcmk__str_eq((const char *)xml_obj->name, XML_CIB_TAG_NODE, pcmk__str_none)) {
 610             new_node = NULL;
 611 
 612             id = crm_element_value(xml_obj, XML_ATTR_ID);
 613             uname = crm_element_value(xml_obj, XML_ATTR_UNAME);
 614             type = crm_element_value(xml_obj, XML_ATTR_TYPE);
 615             score = crm_element_value(xml_obj, XML_RULE_ATTR_SCORE);
 616             crm_trace("Processing node %s/%s", uname, id);
 617 
 618             if (id == NULL) {
 619                 pcmk__config_err("Ignoring <" XML_CIB_TAG_NODE
 620                                  "> entry in configuration without id");
 621                 continue;
 622             }
 623             new_node = pe_create_node(id, uname, type, score, scheduler);
 624 
 625             if (new_node == NULL) {
 626                 return FALSE;
 627             }
 628 
 629             handle_startup_fencing(scheduler, new_node);
 630 
 631             add_node_attrs(xml_obj, new_node, FALSE, scheduler);
 632 
 633             crm_trace("Done with node %s", crm_element_value(xml_obj, XML_ATTR_UNAME));
 634         }
 635     }
 636 
 637     if (scheduler->localhost
 638         && (pe_find_node(scheduler->nodes, scheduler->localhost) == NULL)) {
 639         crm_info("Creating a fake local node");
 640         pe_create_node(scheduler->localhost, scheduler->localhost, NULL, 0,
 641                        scheduler);
 642     }
 643 
 644     return TRUE;
 645 }
 646 
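      /*!
       * \internal
       * \brief Link a resource to its container resource, if any
       *
       * If the resource (or, recursively, any of its children) has the
       * container meta-attribute, point its container member at that resource
       * and add it to the container's list of fillers.
       *
       * \param[in,out] rsc        Resource to check
       * \param[in,out] scheduler  Scheduler data
       */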
 647 static void
 648 setup_container(pcmk_resource_t *rsc, pcmk_scheduler_t *scheduler)
 649 {
 650     const char *container_id = NULL;
 651 
 652     if (rsc->children) {
 653         g_list_foreach(rsc->children, (GFunc) setup_container, scheduler);
 654         return;
 655     }
 656 
 657     container_id = g_hash_table_lookup(rsc->meta, XML_RSC_ATTR_CONTAINER);
 658     if (container_id && !pcmk__str_eq(container_id, rsc->id, pcmk__str_casei)) {
 659         pcmk_resource_t *container = pe_find_resource(scheduler->resources,
 660                                                       container_id);
 661 
 662         if (container) {
 663             rsc->container = container;
 664             pe__set_resource_flags(container, pcmk_rsc_has_filler);
 665             container->fillers = g_list_append(container->fillers, rsc);
 666             pe_rsc_trace(rsc, "Resource %s's container is %s", rsc->id, container_id);
 667         } else {
 668             pe_err("Resource %s: Unknown resource container (%s)", rsc->id, container_id);
 669         }
 670     }
 671 }
 672 
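      /*!
       * \internal
       * \brief Create Pacemaker Remote and guest node objects from resource XML
       *
       * Scan the resource configuration for ocf:pacemaker:remote primitives and
       * for primitives (including group members) with guest node
       * meta-attributes, creating node objects for them before the resources
       * themselves are unpacked.
       *
       * \param[in,out] xml_resources  XML of the configuration's resources section
       * \param[in,out] scheduler      Scheduler data
       *
       * \return TRUE
       */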
 673 gboolean
 674 unpack_remote_nodes(xmlNode *xml_resources, pcmk_scheduler_t *scheduler)
 675 {
 676     xmlNode *xml_obj = NULL;
 677 
 678     /* Create remote nodes and guest nodes from the resource configuration
 679      * before unpacking resources.
 680      */
 681     for (xml_obj = pcmk__xe_first_child(xml_resources); xml_obj != NULL;
 682          xml_obj = pcmk__xe_next(xml_obj)) {
 683 
 684         const char *new_node_id = NULL;
 685 
 686         /* Check for remote nodes, which are defined by ocf:pacemaker:remote
 687          * primitives.
 688          */
 689         if (xml_contains_remote_node(xml_obj)) {
 690             new_node_id = ID(xml_obj);
 691             /* The "pe_find_node" check is here to make sure we don't iterate over
 692              * an expanded node that has already been added to the node list. */
 693             if (new_node_id
 694                 && (pe_find_node(scheduler->nodes, new_node_id) == NULL)) {
 695                 crm_trace("Found remote node %s defined by resource %s",
 696                           new_node_id, ID(xml_obj));
 697                 pe_create_node(new_node_id, new_node_id, "remote", NULL,
 698                                scheduler);
 699             }
 700             continue;
 701         }
 702 
 703         /* Check for guest nodes, which are defined by special meta-attributes
 704          * of a primitive of any type (for example, VirtualDomain or Xen).
 705          */
 706         if (pcmk__str_eq((const char *)xml_obj->name, XML_CIB_TAG_RESOURCE, pcmk__str_none)) {
 707             /* This will add an ocf:pacemaker:remote primitive to the
 708              * configuration for the guest node's connection, to be unpacked
 709              * later.
 710              */
 711             new_node_id = expand_remote_rsc_meta(xml_obj, xml_resources,
 712                                                  scheduler);
 713             if (new_node_id
 714                 && (pe_find_node(scheduler->nodes, new_node_id) == NULL)) {
 715                 crm_trace("Found guest node %s in resource %s",
 716                           new_node_id, ID(xml_obj));
 717                 pe_create_node(new_node_id, new_node_id, "remote", NULL,
 718                                scheduler);
 719             }
 720             continue;
 721         }
 722 
 723         /* Check for guest nodes inside a group. Clones are currently not
 724          * supported as guest nodes.
 725          */
 726         if (pcmk__str_eq((const char *)xml_obj->name, XML_CIB_TAG_GROUP, pcmk__str_none)) {
 727             xmlNode *xml_obj2 = NULL;
 728             for (xml_obj2 = pcmk__xe_first_child(xml_obj); xml_obj2 != NULL;
 729                  xml_obj2 = pcmk__xe_next(xml_obj2)) {
 730 
 731                 new_node_id = expand_remote_rsc_meta(xml_obj2, xml_resources,
 732                                                      scheduler);
 733 
 734                 if (new_node_id
 735                     && (pe_find_node(scheduler->nodes, new_node_id) == NULL)) {
 736                     crm_trace("Found guest node %s in resource %s inside group %s",
 737                               new_node_id, ID(xml_obj2), ID(xml_obj));
 738                     pe_create_node(new_node_id, new_node_id, "remote", NULL,
 739                                    scheduler);
 740                 }
 741             }
 742         }
 743     }
 744     return TRUE;
 745 }
 746 
 747 /* Call this after all the nodes and resources have been
 748  * unpacked, but before the status section is read.
 749  *
 750  * A remote node's online status is reflected by the state
 751  * of the remote node's connection resource. We need to link
 752  * the remote node to this connection resource so we can have
 753  * easy access to the connection resource during the scheduler calculations.
 754  */
 755 static void
 756 link_rsc2remotenode(pcmk_scheduler_t *scheduler, pcmk_resource_t *new_rsc)
 757 {
 758     pcmk_node_t *remote_node = NULL;
 759 
 760     if (new_rsc->is_remote_node == FALSE) {
 761         return;
 762     }
 763 
 764     if (pcmk_is_set(scheduler->flags, pcmk_sched_location_only)) {
 765         /* remote_nodes and remote_resources are not linked in quick location calculations */
 766         return;
 767     }
 768 
 769     remote_node = pe_find_node(scheduler->nodes, new_rsc->id);
 770     CRM_CHECK(remote_node != NULL, return);
 771 
 772     pe_rsc_trace(new_rsc, "Linking remote connection resource %s to %s",
 773                  new_rsc->id, pe__node_name(remote_node));
 774     remote_node->details->remote_rsc = new_rsc;
 775 
 776     if (new_rsc->container == NULL) {
 777         /* Handle start-up fencing for remote nodes (as opposed to guest nodes)
 778          * the same as is done for cluster nodes.
 779          */
 780         handle_startup_fencing(scheduler, remote_node);
 781 
 782     } else {
 783         /* pe_create_node() marks the new node as "remote" or "cluster"; now
 784          * that we know the node is a guest node, update it correctly.
 785          */
 786         g_hash_table_replace(remote_node->details->attrs, strdup(CRM_ATTR_KIND),
 787                              strdup("container"));
 788     }
 789 }
 790 
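      /*!
       * \internal
       * \brief Free a tag object (suitable as a GDestroyNotify callback)
       *
       * \param[in,out] data  Tag object (pcmk_tag_t *) to free
       */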
 791 static void
 792 destroy_tag(gpointer data)
 793 {
 794     pcmk_tag_t *tag = data;
 795 
 796     if (tag) {
 797         free(tag->id);
 798         g_list_free_full(tag->refs, free);
 799         free(tag);
 800     }
 801 }
 802 
 803 /*!
 804  * \internal
 805  * \brief Parse configuration XML for resource information
 806  *
 807  * \param[in]     xml_resources  Top of resource configuration XML
 808  * \param[in,out] scheduler      Scheduler data
 809  *
 810  * \return TRUE
 811  *
 812  * \note unpack_remote_nodes() MUST be called before this, so that the nodes can
 813  *       be used when pe__unpack_resource() calls resource_location()
 814  */
 815 gboolean
 816 unpack_resources(const xmlNode *xml_resources, pcmk_scheduler_t *scheduler)
 817 {
 818     xmlNode *xml_obj = NULL;
 819     GList *gIter = NULL;
 820 
 821     scheduler->template_rsc_sets = pcmk__strkey_table(free, destroy_tag);
 822 
 823     for (xml_obj = pcmk__xe_first_child(xml_resources); xml_obj != NULL;
 824          xml_obj = pcmk__xe_next(xml_obj)) {
 825 
 826         pcmk_resource_t *new_rsc = NULL;
 827         const char *id = ID(xml_obj);
 828 
 829         if (pcmk__str_empty(id)) {
 830             pcmk__config_err("Ignoring <%s> resource without ID",
 831                              xml_obj->name);
 832             continue;
 833         }
 834 
 835         if (pcmk__str_eq((const char *) xml_obj->name, XML_CIB_TAG_RSC_TEMPLATE,
 836                          pcmk__str_none)) {
 837             if (g_hash_table_lookup_extended(scheduler->template_rsc_sets, id,
 838                                              NULL, NULL) == FALSE) {
  839                 /* Record the template's ID so we know it exists, even if unreferenced. */
 840                 g_hash_table_insert(scheduler->template_rsc_sets, strdup(id),
 841                                     NULL);
 842             }
 843             continue;
 844         }
 845 
 846         crm_trace("Unpacking <%s " XML_ATTR_ID "='%s'>",
 847                   xml_obj->name, id);
 848         if (pe__unpack_resource(xml_obj, &new_rsc, NULL,
 849                                 scheduler) == pcmk_rc_ok) {
 850             scheduler->resources = g_list_append(scheduler->resources, new_rsc);
 851             pe_rsc_trace(new_rsc, "Added resource %s", new_rsc->id);
 852 
 853         } else {
 854             pcmk__config_err("Ignoring <%s> resource '%s' "
 855                              "because configuration is invalid",
 856                              xml_obj->name, id);
 857         }
 858     }
 859 
 860     for (gIter = scheduler->resources; gIter != NULL; gIter = gIter->next) {
 861         pcmk_resource_t *rsc = (pcmk_resource_t *) gIter->data;
 862 
 863         setup_container(rsc, scheduler);
 864         link_rsc2remotenode(scheduler, rsc);
 865     }
 866 
 867     scheduler->resources = g_list_sort(scheduler->resources,
 868                                       pe__cmp_rsc_priority);
 869     if (pcmk_is_set(scheduler->flags, pcmk_sched_location_only)) {
 870         /* Ignore */
 871 
 872     } else if (pcmk_is_set(scheduler->flags, pcmk_sched_fencing_enabled)
 873                && !pcmk_is_set(scheduler->flags, pcmk_sched_have_fencing)) {
 874 
 875         pcmk__config_err("Resource start-up disabled since no STONITH resources have been defined");
 876         pcmk__config_err("Either configure some or disable STONITH with the stonith-enabled option");
 877         pcmk__config_err("NOTE: Clusters with shared data need STONITH to ensure data integrity");
 878     }
 879 
 880     return TRUE;
 881 }
 882 
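      /*!
       * \internal
       * \brief Unpack configuration tags and their object references
       *
       * \param[in,out] xml_tags   XML of the configuration's tags section
       * \param[in,out] scheduler  Scheduler data
       *
       * \return TRUE on success, otherwise FALSE
       */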
 883 gboolean
 884 unpack_tags(xmlNode *xml_tags, pcmk_scheduler_t *scheduler)
 885 {
 886     xmlNode *xml_tag = NULL;
 887 
 888     scheduler->tags = pcmk__strkey_table(free, destroy_tag);
 889 
 890     for (xml_tag = pcmk__xe_first_child(xml_tags); xml_tag != NULL;
 891          xml_tag = pcmk__xe_next(xml_tag)) {
 892 
 893         xmlNode *xml_obj_ref = NULL;
 894         const char *tag_id = ID(xml_tag);
 895 
 896         if (!pcmk__str_eq((const char *)xml_tag->name, XML_CIB_TAG_TAG, pcmk__str_none)) {
 897             continue;
 898         }
 899 
 900         if (tag_id == NULL) {
 901             pcmk__config_err("Ignoring <%s> without " XML_ATTR_ID,
 902                              (const char *) xml_tag->name);
 903             continue;
 904         }
 905 
 906         for (xml_obj_ref = pcmk__xe_first_child(xml_tag); xml_obj_ref != NULL;
 907              xml_obj_ref = pcmk__xe_next(xml_obj_ref)) {
 908 
 909             const char *obj_ref = ID(xml_obj_ref);
 910 
 911             if (!pcmk__str_eq((const char *)xml_obj_ref->name, XML_CIB_TAG_OBJ_REF, pcmk__str_none)) {
 912                 continue;
 913             }
 914 
 915             if (obj_ref == NULL) {
 916                 pcmk__config_err("Ignoring <%s> for tag '%s' without " XML_ATTR_ID,
 917                                  xml_obj_ref->name, tag_id);
 918                 continue;
 919             }
 920 
 921             if (add_tag_ref(scheduler->tags, tag_id, obj_ref) == FALSE) {
 922                 return FALSE;
 923             }
 924         }
 925     }
 926 
 927     return TRUE;
 928 }
 929 
 930 /* The ticket state section:
 931  * "/cib/status/tickets/ticket_state" */
 932 static gboolean
 933 unpack_ticket_state(xmlNode *xml_ticket, pcmk_scheduler_t *scheduler)
 934 {
 935     const char *ticket_id = NULL;
 936     const char *granted = NULL;
 937     const char *last_granted = NULL;
 938     const char *standby = NULL;
 939     xmlAttrPtr xIter = NULL;
 940 
 941     pcmk_ticket_t *ticket = NULL;
 942 
 943     ticket_id = ID(xml_ticket);
 944     if (pcmk__str_empty(ticket_id)) {
 945         return FALSE;
 946     }
 947 
 948     crm_trace("Processing ticket state for %s", ticket_id);
 949 
 950     ticket = g_hash_table_lookup(scheduler->tickets, ticket_id);
 951     if (ticket == NULL) {
 952         ticket = ticket_new(ticket_id, scheduler);
 953         if (ticket == NULL) {
 954             return FALSE;
 955         }
 956     }
 957 
 958     for (xIter = xml_ticket->properties; xIter; xIter = xIter->next) {
 959         const char *prop_name = (const char *)xIter->name;
 960         const char *prop_value = pcmk__xml_attr_value(xIter);
 961 
 962         if (pcmk__str_eq(prop_name, XML_ATTR_ID, pcmk__str_none)) {
 963             continue;
 964         }
 965         g_hash_table_replace(ticket->state, strdup(prop_name), strdup(prop_value));
 966     }
 967 
 968     granted = g_hash_table_lookup(ticket->state, "granted");
 969     if (granted && crm_is_true(granted)) {
 970         ticket->granted = TRUE;
 971         crm_info("We have ticket '%s'", ticket->id);
 972     } else {
 973         ticket->granted = FALSE;
 974         crm_info("We do not have ticket '%s'", ticket->id);
 975     }
 976 
 977     last_granted = g_hash_table_lookup(ticket->state, "last-granted");
 978     if (last_granted) {
 979         long long last_granted_ll;
 980 
 981         pcmk__scan_ll(last_granted, &last_granted_ll, 0LL);
 982         ticket->last_granted = (time_t) last_granted_ll;
 983     }
 984 
 985     standby = g_hash_table_lookup(ticket->state, "standby");
 986     if (standby && crm_is_true(standby)) {
 987         ticket->standby = TRUE;
 988         if (ticket->granted) {
 989             crm_info("Granted ticket '%s' is in standby-mode", ticket->id);
 990         }
 991     } else {
 992         ticket->standby = FALSE;
 993     }
 994 
 995     crm_trace("Done with ticket state for %s", ticket_id);
 996 
 997     return TRUE;
 998 }
 999 
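      /*!
       * \internal
       * \brief Unpack all ticket states in the CIB status section
       *
       * \param[in,out] xml_tickets  XML of the status section's tickets element
       * \param[in,out] scheduler    Scheduler data
       *
       * \return TRUE
       */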
1000 static gboolean
1001 unpack_tickets_state(xmlNode *xml_tickets, pcmk_scheduler_t *scheduler)
1002 {
1003     xmlNode *xml_obj = NULL;
1004 
1005     for (xml_obj = pcmk__xe_first_child(xml_tickets); xml_obj != NULL;
1006          xml_obj = pcmk__xe_next(xml_obj)) {
1007 
1008         if (!pcmk__str_eq((const char *)xml_obj->name, XML_CIB_TAG_TICKET_STATE, pcmk__str_none)) {
1009             continue;
1010         }
1011         unpack_ticket_state(xml_obj, scheduler);
1012     }
1013 
1014     return TRUE;
1015 }
1016 
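      /*!
       * \internal
       * \brief Unpack a Pacemaker Remote node's transient attributes and flags
       *
       * Process a remote or guest node's node_state entry: unpack its transient
       * attributes and update its shutdown, standby, maintenance, and resource
       * discovery details accordingly.
       *
       * \param[in,out] this_node  Node whose state is being unpacked
       * \param[in]     state      CIB node_state XML for the node
       * \param[in,out] scheduler  Scheduler data
       */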
1017 static void
1018 unpack_handle_remote_attrs(pcmk_node_t *this_node, const xmlNode *state,
1019                            pcmk_scheduler_t *scheduler)
1020 {
1021     const char *resource_discovery_enabled = NULL;
1022     const xmlNode *attrs = NULL;
1023     pcmk_resource_t *rsc = NULL;
1024 
1025     if (!pcmk__str_eq((const char *)state->name, XML_CIB_TAG_STATE, pcmk__str_none)) {
1026         return;
1027     }
1028 
1029     if ((this_node == NULL) || !pe__is_guest_or_remote_node(this_node)) {
1030         return;
1031     }
1032     crm_trace("Processing Pacemaker Remote node %s", pe__node_name(this_node));
1033 
1034     pcmk__scan_min_int(crm_element_value(state, XML_NODE_IS_MAINTENANCE),
1035                        &(this_node->details->remote_maintenance), 0);
1036 
1037     rsc = this_node->details->remote_rsc;
1038     if (this_node->details->remote_requires_reset == FALSE) {
1039         this_node->details->unclean = FALSE;
1040         this_node->details->unseen = FALSE;
1041     }
1042     attrs = find_xml_node(state, XML_TAG_TRANSIENT_NODEATTRS, FALSE);
1043     add_node_attrs(attrs, this_node, TRUE, scheduler);
1044 
1045     if (pe__shutdown_requested(this_node)) {
1046         crm_info("%s is shutting down", pe__node_name(this_node));
1047         this_node->details->shutdown = TRUE;
1048     }
1049  
1050     if (crm_is_true(pe_node_attribute_raw(this_node, "standby"))) {
1051         crm_info("%s is in standby mode", pe__node_name(this_node));
1052         this_node->details->standby = TRUE;
1053     }
1054 
1055     if (crm_is_true(pe_node_attribute_raw(this_node, "maintenance")) ||
1056         ((rsc != NULL) && !pcmk_is_set(rsc->flags, pcmk_rsc_managed))) {
1057         crm_info("%s is in maintenance mode", pe__node_name(this_node));
1058         this_node->details->maintenance = TRUE;
1059     }
1060 
1061     resource_discovery_enabled = pe_node_attribute_raw(this_node, XML_NODE_ATTR_RSC_DISCOVERY);
1062     if (resource_discovery_enabled && !crm_is_true(resource_discovery_enabled)) {
1063         if (pe__is_remote_node(this_node)
1064             && !pcmk_is_set(scheduler->flags, pcmk_sched_fencing_enabled)) {
1065             crm_warn("Ignoring " XML_NODE_ATTR_RSC_DISCOVERY
1066                      " attribute on Pacemaker Remote node %s"
1067                      " because fencing is disabled",
1068                      pe__node_name(this_node));
1069         } else {
1070             /* This is either a remote node with fencing enabled, or a guest
1071              * node. We don't care whether fencing is enabled when fencing guest
1072              * nodes, because they are "fenced" by recovering their containing
1073              * resource.
1074              */
1075             crm_info("%s has resource discovery disabled",
1076                      pe__node_name(this_node));
1077             this_node->details->rsc_discovery_enabled = FALSE;
1078         }
1079     }
1080 }
1081 
1082 /*!
1083  * \internal
1084  * \brief Unpack a cluster node's transient attributes
1085  *
1086  * \param[in]     state      CIB node state XML
1087  * \param[in,out] node       Cluster node whose attributes are being unpacked
1088  * \param[in,out] scheduler  Scheduler data
1089  */
1090 static void
1091 unpack_transient_attributes(const xmlNode *state, pcmk_node_t *node,
1092                             pcmk_scheduler_t *scheduler)
1093 {
1094     const char *discovery = NULL;
1095     const xmlNode *attrs = find_xml_node(state, XML_TAG_TRANSIENT_NODEATTRS,
1096                                          FALSE);
1097 
1098     add_node_attrs(attrs, node, TRUE, scheduler);
1099 
1100     if (crm_is_true(pe_node_attribute_raw(node, "standby"))) {
1101         crm_info("%s is in standby mode", pe__node_name(node));
1102         node->details->standby = TRUE;
1103     }
1104 
1105     if (crm_is_true(pe_node_attribute_raw(node, "maintenance"))) {
1106         crm_info("%s is in maintenance mode", pe__node_name(node));
1107         node->details->maintenance = TRUE;
1108     }
1109 
1110     discovery = pe_node_attribute_raw(node, XML_NODE_ATTR_RSC_DISCOVERY);
1111     if ((discovery != NULL) && !crm_is_true(discovery)) {
1112         crm_warn("Ignoring " XML_NODE_ATTR_RSC_DISCOVERY
1113                  " attribute for %s because disabling resource discovery "
1114                  "is not allowed for cluster nodes", pe__node_name(node));
1115     }
1116 }
1117 
1118 /*!
1119  * \internal
1120  * \brief Unpack a node state entry (first pass)
1121  *
1122  * Unpack one node state entry from status. This unpacks information from the
1123  * node_state element itself and node attributes inside it, but not the
1124  * resource history inside it. Multiple passes through the status are needed to
1125  * fully unpack everything.
1126  *
1127  * \param[in]     state      CIB node state XML
1128  * \param[in,out] scheduler  Scheduler data
1129  */
1130 static void
1131 unpack_node_state(const xmlNode *state, pcmk_scheduler_t *scheduler)
1132 {
1133     const char *id = NULL;
1134     const char *uname = NULL;
1135     pcmk_node_t *this_node = NULL;
1136 
1137     id = crm_element_value(state, XML_ATTR_ID);
1138     if (id == NULL) {
1139         crm_warn("Ignoring malformed " XML_CIB_TAG_STATE " entry without "
1140                  XML_ATTR_ID);
1141         return;
1142     }
1143 
1144     uname = crm_element_value(state, XML_ATTR_UNAME);
1145     if (uname == NULL) {
 1146         /* If a joining peer causes the cluster to acquire quorum from Corosync
 1147          * before the peer has joined the CPG membership of pacemaker-controld,
 1148          * it's possible that the node_state entry created for it doesn't have
 1149          * an uname yet. Treat the node as pending and wait for it to join
 1150          * CPG.
 1151          */
1152         crm_trace("Handling " XML_CIB_TAG_STATE " entry with id=\"%s\" without "
1153                   XML_ATTR_UNAME, id);
1154     }
1155 
1156     this_node = pe_find_node_any(scheduler->nodes, id, uname);
1157     if (this_node == NULL) {
1158         pcmk__config_warn("Ignoring recorded node state for id=\"%s\" (%s) "
1159                           "because it is no longer in the configuration",
1160                           id, pcmk__s(uname, "uname unknown"));
1161         return;
1162     }
1163 
1164     if (pe__is_guest_or_remote_node(this_node)) {
1165         /* We can't determine the online status of Pacemaker Remote nodes until
1166          * after all resource history has been unpacked. In this first pass, we
1167          * do need to mark whether the node has been fenced, as this plays a
1168          * role during unpacking cluster node resource state.
1169          */
1170         pcmk__scan_min_int(crm_element_value(state, XML_NODE_IS_FENCED),
1171                            &(this_node->details->remote_was_fenced), 0);
1172         return;
1173     }
1174 
1175     unpack_transient_attributes(state, this_node, scheduler);
1176 
1177     /* Provisionally mark this cluster node as clean. We have at least seen it
1178      * in the current cluster's lifetime.
1179      */
1180     this_node->details->unclean = FALSE;
1181     this_node->details->unseen = FALSE;
1182 
1183     crm_trace("Determining online status of cluster node %s (id %s)",
1184               pe__node_name(this_node), id);
1185     determine_online_status(state, this_node, scheduler);
1186 
1187     if (!pcmk_is_set(scheduler->flags, pcmk_sched_quorate)
1188         && this_node->details->online
1189         && (scheduler->no_quorum_policy == pcmk_no_quorum_fence)) {
1190         /* Everything else should flow from this automatically
1191          * (at least until the scheduler becomes able to migrate off
1192          * healthy resources)
1193          */
1194         pe_fence_node(scheduler, this_node, "cluster does not have quorum",
1195                       FALSE);
1196     }
1197 }
1198 
1199 /*!
1200  * \internal
1201  * \brief Unpack nodes' resource history as much as possible
1202  *
1203  * Unpack as many nodes' resource history as possible in one pass through the
1204  * status. We need to process Pacemaker Remote nodes' connections/containers
1205  * before unpacking their history; the connection/container history will be
1206  * in another node's history, so it might take multiple passes to unpack
1207  * everything.
1208  *
1209  * \param[in]     status     CIB XML status section
1210  * \param[in]     fence      If true, treat any not-yet-unpacked nodes as unseen
1211  * \param[in,out] scheduler  Scheduler data
1212  *
1213  * \return Standard Pacemaker return code (specifically pcmk_rc_ok if done,
1214  *         or EAGAIN if more unpacking remains to be done)
1215  */
1216 static int
1217 unpack_node_history(const xmlNode *status, bool fence,
1218                     pcmk_scheduler_t *scheduler)
1219 {
1220     int rc = pcmk_rc_ok;
1221 
1222     // Loop through all node_state entries in CIB status
1223     for (const xmlNode *state = first_named_child(status, XML_CIB_TAG_STATE);
1224          state != NULL; state = crm_next_same_xml(state)) {
1225 
1226         const char *id = ID(state);
1227         const char *uname = crm_element_value(state, XML_ATTR_UNAME);
1228         pcmk_node_t *this_node = NULL;
1229 
1230         if ((id == NULL) || (uname == NULL)) {
1231             // Warning already logged in first pass through status section
1232             crm_trace("Not unpacking resource history from malformed "
1233                       XML_CIB_TAG_STATE " without id and/or uname");
1234             continue;
1235         }
1236 
1237         this_node = pe_find_node_any(scheduler->nodes, id, uname);
1238         if (this_node == NULL) {
1239             // Warning already logged in first pass through status section
1240             crm_trace("Not unpacking resource history for node %s because "
1241                       "no longer in configuration", id);
1242             continue;
1243         }
1244 
1245         if (this_node->details->unpacked) {
1246             crm_trace("Not unpacking resource history for node %s because "
1247                       "already unpacked", id);
1248             continue;
1249         }
1250 
1251         if (fence) {
1252             // We're processing all remaining nodes
1253 
1254         } else if (pe__is_guest_node(this_node)) {
1255             /* We can unpack a guest node's history only after we've unpacked
1256              * other resource history to the point that we know that the node's
1257              * connection and containing resource are both up.
1258              */
1259             pcmk_resource_t *rsc = this_node->details->remote_rsc;
1260 
1261             if ((rsc == NULL) || (rsc->role != pcmk_role_started)
1262                 || (rsc->container->role != pcmk_role_started)) {
1263                 crm_trace("Not unpacking resource history for guest node %s "
1264                           "because container and connection are not known to "
1265                           "be up", id);
1266                 continue;
1267             }
1268 
1269         } else if (pe__is_remote_node(this_node)) {
1270             /* We can unpack a remote node's history only after we've unpacked
1271              * other resource history to the point that we know that the node's
1272              * connection is up, with the exception of when shutdown locks are
1273              * in use.
1274              */
1275             pcmk_resource_t *rsc = this_node->details->remote_rsc;
1276 
1277             if ((rsc == NULL)
1278                 || (!pcmk_is_set(scheduler->flags, pcmk_sched_shutdown_lock)
1279                     && (rsc->role != pcmk_role_started))) {
1280                 crm_trace("Not unpacking resource history for remote node %s "
1281                           "because connection is not known to be up", id);
1282                 continue;
1283             }
1284 
1285         /* If fencing and shutdown locks are disabled and we're not processing
1286          * unseen nodes, then we don't want to unpack offline nodes until online
1287          * nodes have been unpacked. This allows us to number active clone
1288          * instances first.
1289          */
1290         } else if (!pcmk_any_flags_set(scheduler->flags,
1291                                        pcmk_sched_fencing_enabled
1292                                        |pcmk_sched_shutdown_lock)
1293                    && !this_node->details->online) {
1294             crm_trace("Not unpacking resource history for offline "
1295                       "cluster node %s", id);
1296             continue;
1297         }
1298 
1299         if (pe__is_guest_or_remote_node(this_node)) {
1300             determine_remote_online_status(scheduler, this_node);
1301             unpack_handle_remote_attrs(this_node, state, scheduler);
1302         }
1303 
1304         crm_trace("Unpacking resource history for %snode %s",
1305                   (fence? "unseen " : ""), id);
1306 
1307         this_node->details->unpacked = TRUE;
1308         unpack_node_lrm(this_node, state, scheduler);
1309 
1310         rc = EAGAIN; // Other node histories might depend on this one
1311     }
1312     return rc;
1313 }
1314 
1315 /*!
1316  * \internal
1317  * \brief Unpack the status section of the CIB
      *
      * Among other things, this marks nodes that are down or stopping, and creates
      * positive rsc_to_node constraints between resources and the nodes they are
      * running on.
      *
      * \param[in,out] status     CIB XML status section
      * \param[in,out] scheduler  Scheduler data
      *
      * \return Always TRUE
      */
1318 gboolean
1319 unpack_status(xmlNode *status, pcmk_scheduler_t *scheduler)
1320 {
1321     xmlNode *state = NULL;
1322 
1323     crm_trace("Beginning unpack");
1324 
1325     if (scheduler->tickets == NULL) {
1326         scheduler->tickets = pcmk__strkey_table(free, destroy_ticket);
1327     }
1328 
1329     for (state = pcmk__xe_first_child(status); state != NULL;
1330          state = pcmk__xe_next(state)) {
1331 
1332         if (pcmk__str_eq((const char *)state->name, XML_CIB_TAG_TICKETS, pcmk__str_none)) {
1333             unpack_tickets_state((xmlNode *) state, scheduler);
1334 
1335         } else if (pcmk__str_eq((const char *)state->name, XML_CIB_TAG_STATE, pcmk__str_none)) {
1336             unpack_node_state(state, scheduler);
1337         }
1338     }
1339 
1340     while (unpack_node_history(status, FALSE, scheduler) == EAGAIN) {
1341         crm_trace("Another pass through node resource histories is needed");
1342     }
1343 
1344     // Now catch any nodes we didn't see
1345     unpack_node_history(status,
1346                         pcmk_is_set(scheduler->flags,
1347                                     pcmk_sched_fencing_enabled),
1348                         scheduler);
1349 
1350     /* Now that we know where resources are, we can schedule stops of containers
1351      * with failed bundle connections
1352      */
1353     if (scheduler->stop_needed != NULL) {
1354         for (GList *item = scheduler->stop_needed; item; item = item->next) {
1355             pcmk_resource_t *container = item->data;
1356             pcmk_node_t *node = pe__current_node(container);
1357 
1358             if (node) {
1359                 stop_action(container, node, FALSE);
1360             }
1361         }
1362         g_list_free(scheduler->stop_needed);
1363         scheduler->stop_needed = NULL;
1364     }
1365 
1366     /* Now that we know status of all Pacemaker Remote connections and nodes,
1367      * we can stop connections for node shutdowns, and check the online status
1368      * of remote/guest nodes that didn't have any node history to unpack.
1369      */
1370     for (GList *gIter = scheduler->nodes; gIter != NULL; gIter = gIter->next) {
1371         pcmk_node_t *this_node = gIter->data;
1372 
1373         if (!pe__is_guest_or_remote_node(this_node)) {
1374             continue;
1375         }
1376         if (this_node->details->shutdown
1377             && (this_node->details->remote_rsc != NULL)) {
1378             pe__set_next_role(this_node->details->remote_rsc, pcmk_role_stopped,
1379                               "remote shutdown");
1380         }
1381         if (!this_node->details->unpacked) {
1382             determine_remote_online_status(scheduler, this_node);
1383         }
1384     }
1385 
1386     return TRUE;
1387 }
1388 
1389 /*!
1390  * \internal
1391  * \brief Unpack node's time when it became a member at the cluster layer
1392  *
1393  * \param[in]     node_state  Node's node_state entry
1394  * \param[in,out] scheduler   Scheduler data
1395  *
1396  * \return Epoch time when node became a cluster member
1397  *         (or scheduler effective time for legacy entries) if a member,
1398  *         0 if not a member, or -1 if no valid information available
1399  */
1400 static long long
1401 unpack_node_member(const xmlNode *node_state, pcmk_scheduler_t *scheduler)
1402 {
1403     const char *member_time = crm_element_value(node_state, PCMK__XA_IN_CCM);
1404     int member = 0;
1405 
1406     if (member_time == NULL) {
1407         return -1LL;
1408 
1409     } else if (crm_str_to_boolean(member_time, &member) == 1) {
1410         /* If in_ccm=0, we'll return 0 here. If in_ccm=1, either the entry was
1411          * recorded as a boolean for a DC < 2.1.7, or the node is pending
1412          * shutdown and has left the CPG, in which case it was set to 1 to avoid
1413          * fencing for node-pending-timeout.
1414          *
1415          * We return the effective time for in_ccm=1 because what matters for
1416          * avoiding fencing is that the effective time minus this value stays
1417          * below the node-pending-timeout.
1418          */
1419         return member? (long long) get_effective_time(scheduler) : 0LL;
1420 
1421     } else {
1422         long long when_member = 0LL;
1423 
1424         if ((pcmk__scan_ll(member_time, &when_member,
1425                            0LL) != pcmk_rc_ok) || (when_member < 0LL)) {
1426             crm_warn("Unrecognized value '%s' for " PCMK__XA_IN_CCM
1427                      " in " XML_CIB_TAG_STATE " entry", member_time);
1428             return -1LL;
1429         }
1430         return when_member;
1431     }
1432 }
1433 
1434 /*!
1435  * \internal
1436  * \brief Unpack node's time when it became online in process group
1437  *
1438  * \param[in] node_state  Node's node_state entry
1439  *
1440  * \return Epoch time when node became online in process group (or 0 if not
1441  *         online, or 1 for legacy online entries)
1442  */
1443 static long long
1444 unpack_node_online(const xmlNode *node_state)
1445 {
1446     const char *peer_time = crm_element_value(node_state, PCMK__XA_CRMD);
1447 
1448     // @COMPAT Entries recorded for DCs < 2.1.7 have "online" or "offline"
1449     if (pcmk__str_eq(peer_time, OFFLINESTATUS,
1450                      pcmk__str_casei|pcmk__str_null_matches)) {
1451         return 0LL;
1452 
1453     } else if (pcmk__str_eq(peer_time, ONLINESTATUS, pcmk__str_casei)) {
1454         return 1LL;
1455 
1456     } else {
1457         long long when_online = 0LL;
1458 
1459         if ((pcmk__scan_ll(peer_time, &when_online, 0LL) != pcmk_rc_ok)
1460             || (when_online < 0)) {
1461             crm_warn("Unrecognized value '%s' for " PCMK__XA_CRMD " in "
1462                      XML_CIB_TAG_STATE " entry, assuming offline", peer_time);
1463             return 0LL;
1464         }
1465         return when_online;
1466     }
1467 }
1468 
1469 /*!
1470  * \internal
1471  * \brief Unpack node attribute for user-requested fencing
1472  *
1473  * \param[in] node        Node to check
1474  * \param[in] node_state  Node's node_state entry in CIB status
1475  *
1476  * \return \c true if fencing has been requested for \p node, otherwise \c false
1477  */
1478 static bool
1479 unpack_node_terminate(const pcmk_node_t *node, const xmlNode *node_state)
1480 {
1481     long long value = 0LL;
1482     int value_i = 0;
1483     const char *value_s = pe_node_attribute_raw(node, PCMK_NODE_ATTR_TERMINATE);
1484 
1485     // Value may be boolean or an epoch time
1486     if (crm_str_to_boolean(value_s, &value_i) == 1) {
1487         return (value_i != 0);
1488     }
1489     if (pcmk__scan_ll(value_s, &value, 0LL) == pcmk_rc_ok) {
1490         return (value > 0);
1491     }
1492     crm_warn("Ignoring unrecognized value '%s' for " PCMK_NODE_ATTR_TERMINATE
1493                  " node attribute for %s", value_s, pe__node_name(node));
1494     return false;
1495 }
1496 
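     /*!
      * \internal
      * \brief Determine a cluster node's online status when fencing is disabled
      *
      * \param[in,out] scheduler   Scheduler data
      * \param[in]     node_state  Node's node_state entry in the CIB status
      * \param[in,out] this_node   Node whose status is being determined
      *
      * \return TRUE if the node is considered online, otherwise FALSE
      * \note This may mark \p this_node as unclean as a side effect.
      */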
1497 static gboolean
1498 determine_online_status_no_fencing(pcmk_scheduler_t *scheduler,
1499                                    const xmlNode *node_state,
1500                                    pcmk_node_t *this_node)
1501 {
1502     gboolean online = FALSE;
1503     const char *join = crm_element_value(node_state, PCMK__XA_JOIN);
1504     const char *exp_state = crm_element_value(node_state, PCMK__XA_EXPECTED);
1505     long long when_member = unpack_node_member(node_state, scheduler);
1506     long long when_online = unpack_node_online(node_state);
1507 
1508     if (when_member <= 0) {
1509         crm_trace("Node %s is %sdown", pe__node_name(this_node),
1510                   ((when_member < 0)? "presumed " : ""));
1511 
1512     } else if (when_online > 0) {
1513         if (pcmk__str_eq(join, CRMD_JOINSTATE_MEMBER, pcmk__str_casei)) {
1514             online = TRUE;
1515         } else {
1516             crm_debug("Node %s is not ready to run resources: %s",
1517                       pe__node_name(this_node), join);
1518         }
1519 
1520     } else if (this_node->details->expected_up == FALSE) {
1521         crm_trace("Node %s controller is down: "
1522                   "member@%lld online@%lld join=%s expected=%s",
1523                   pe__node_name(this_node), when_member, when_online,
1524                   pcmk__s(join, "<null>"), pcmk__s(exp_state, "<null>"));
1525 
1526     } else {
1527         /* mark it unclean */
1528         pe_fence_node(scheduler, this_node, "peer is unexpectedly down", FALSE);
1529         crm_info("Node %s member@%lld online@%lld join=%s expected=%s",
1530                  pe__node_name(this_node), when_member, when_online,
1531                  pcmk__s(join, "<null>"), pcmk__s(exp_state, "<null>"));
1532     }
1533     return online;
1534 }
1535 
1536 /*!
1537  * \internal
1538  * \brief Check whether a node has taken too long to join controller group
1539  *
1540  * \param[in,out] scheduler    Scheduler data
1541  * \param[in]     node         Node to check
1542  * \param[in]     when_member  Epoch time when node became a cluster member
1543  * \param[in]     when_online  Epoch time when node joined controller group
1544  *
1545  * \return true if node has been pending (on the way up) longer than
1546  *         node-pending-timeout, otherwise false
1547  * \note This will also update the cluster's recheck time if appropriate.
1548  */
1549 static inline bool
1550 pending_too_long(pcmk_scheduler_t *scheduler, const pcmk_node_t *node,
1551                  long long when_member, long long when_online)
1552 {
1553     if ((scheduler->node_pending_timeout > 0)
1554         && (when_member > 0) && (when_online <= 0)) {
1555         // There is a timeout on pending nodes, and node is pending
1556 
1557         time_t timeout = when_member + scheduler->node_pending_timeout;
1558 
1559         if (get_effective_time(node->details->data_set) >= timeout) {
1560             return true; // Node has timed out
1561         }
1562 
1563         // Node is pending, but still has time
1564         pe__update_recheck_time(timeout, scheduler, "pending node timeout");
1565     }
1566     return false;
1567 }
1568 
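     /*!
      * \internal
      * \brief Determine a cluster node's online status when fencing is enabled
      *
      * \param[in,out] scheduler   Scheduler data
      * \param[in]     node_state  Node's node_state entry in the CIB status
      * \param[in,out] this_node   Node whose status is being determined
      *
      * \return true if the node is considered online, otherwise false
      * \note This may mark \p this_node as unclean (to be fenced) as a side effect.
      */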
1569 static bool
1570 determine_online_status_fencing(pcmk_scheduler_t *scheduler,
1571                                 const xmlNode *node_state,
1572                                 pcmk_node_t *this_node)
1573 {
1574     bool termination_requested = unpack_node_terminate(this_node, node_state);
1575     const char *join = crm_element_value(node_state, PCMK__XA_JOIN);
1576     const char *exp_state = crm_element_value(node_state, PCMK__XA_EXPECTED);
1577     long long when_member = unpack_node_member(node_state, scheduler);
1578     long long when_online = unpack_node_online(node_state);
1579 
1580 /*
1581   - PCMK__XA_JOIN          ::= member|down|pending|banned
1582   - PCMK__XA_EXPECTED      ::= member|down
1583 
1584   @COMPAT with entries recorded for DCs < 2.1.7
1585   - PCMK__XA_IN_CCM        ::= true|false
1586   - PCMK__XA_CRMD          ::= online|offline
1587 
1588   Since crm_feature_set 3.18.0 (pacemaker-2.1.7):
1589   - PCMK__XA_IN_CCM        ::= <timestamp>|0
1590   Time since when the node has been a cluster member. A value of 0 means the
1591   node is not a cluster member.
1592 
1593   - PCMK__XA_CRMD          ::= <timestamp>|0
1594   Time since when the peer has been online in CPG. A value of 0 means the
1595   peer is offline in CPG.
1596 */
1597 
1598     crm_trace("Node %s member@%lld online@%lld join=%s expected=%s%s",
1599               pe__node_name(this_node), when_member, when_online,
1600               pcmk__s(join, "<null>"), pcmk__s(exp_state, "<null>"),
1601               (termination_requested? " (termination requested)" : ""));
1602 
1603     if (this_node->details->shutdown) {
1604         crm_debug("%s is shutting down", pe__node_name(this_node));
1605 
1606         /* Slightly different criteria since we can't shut down a dead peer */
1607         return (when_online > 0);
1608     }
1609 
1610     if (when_member < 0) {
1611         pe_fence_node(scheduler, this_node,
1612                       "peer has not been seen by the cluster", FALSE);
1613         return false;
1614     }
1615 
1616     if (pcmk__str_eq(join, CRMD_JOINSTATE_NACK, pcmk__str_none)) {
1617         pe_fence_node(scheduler, this_node,
1618                       "peer failed Pacemaker membership criteria", FALSE);
1619 
1620     } else if (termination_requested) {
1621         if ((when_member <= 0) && (when_online <= 0)
1622             && pcmk__str_eq(join, CRMD_JOINSTATE_DOWN, pcmk__str_none)) {
1623             crm_info("%s was fenced as requested", pe__node_name(this_node));
1624             return false;
1625         }
1626         pe_fence_node(scheduler, this_node, "fencing was requested", false);
1627 
1628     } else if (pcmk__str_eq(exp_state, CRMD_JOINSTATE_DOWN,
1629                             pcmk__str_null_matches)) {
1630 
1631         if (pending_too_long(scheduler, this_node, when_member, when_online)) {
1632             pe_fence_node(scheduler, this_node,
1633                           "peer pending timed out on joining the process group",
1634                           FALSE);
1635 
1636         } else if ((when_member > 0) || (when_online > 0)) {
1637             crm_info("- %s is not ready to run resources",
1638                      pe__node_name(this_node));
1639             this_node->details->standby = TRUE;
1640             this_node->details->pending = TRUE;
1641 
1642         } else {
1643             crm_trace("%s is down or still coming up",
1644                       pe__node_name(this_node));
1645         }
1646 
1647     } else if (when_member <= 0) {
1648         // Consider `priority-fencing-delay` for lost nodes
1649         pe_fence_node(scheduler, this_node,
1650                       "peer is no longer part of the cluster", TRUE);
1651 
1652     } else if (when_online <= 0) {
1653         pe_fence_node(scheduler, this_node,
1654                       "peer process is no longer available", FALSE);
1655 
1656         /* Everything is running at this point, now check join state */
1657 
1658     } else if (pcmk__str_eq(join, CRMD_JOINSTATE_MEMBER, pcmk__str_none)) {
1659         crm_info("%s is active", pe__node_name(this_node));
1660 
1661     } else if (pcmk__str_any_of(join, CRMD_JOINSTATE_PENDING,
1662                                 CRMD_JOINSTATE_DOWN, NULL)) {
1663         crm_info("%s is not ready to run resources", pe__node_name(this_node));
1664         this_node->details->standby = TRUE;
1665         this_node->details->pending = TRUE;
1666 
1667     } else {
1668         pe_fence_node(scheduler, this_node, "peer was in an unknown state",
1669                       FALSE);
1670     }
1671 
1672     return (when_member > 0);
1673 }
1674 
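     /*!
      * \internal
      * \brief Determine the online status of a Pacemaker Remote or guest node
      *
      * The node's online, shutdown, and remote_requires_reset details are updated
      * based on the state of its connection resource (and, for guest nodes, its
      * container resource and host).
      *
      * \param[in,out] scheduler  Scheduler data
      * \param[in,out] this_node  Remote or guest node to check
      */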
1675 static void
1676 determine_remote_online_status(pcmk_scheduler_t *scheduler,
1677                                pcmk_node_t *this_node)
1678 {
1679     pcmk_resource_t *rsc = this_node->details->remote_rsc;
1680     pcmk_resource_t *container = NULL;
1681     pcmk_node_t *host = NULL;
1682 
1683     /* If there is a node state entry for a (former) Pacemaker Remote node
1684      * but no resource creating that node, the node's connection resource will
1685      * be NULL. Consider it an offline remote node in that case.
1686      */
1687     if (rsc == NULL) {
1688         this_node->details->online = FALSE;
1689         goto remote_online_done;
1690     }
1691 
1692     container = rsc->container;
1693 
1694     if (container && pcmk__list_of_1(rsc->running_on)) {
1695         host = rsc->running_on->data;
1696     }
1697 
1698     /* If the resource is currently started, mark it online. */
1699     if (rsc->role == pcmk_role_started) {
1700         crm_trace("%s node %s presumed ONLINE because connection resource is started",
1701                   (container? "Guest" : "Remote"), this_node->details->id);
1702         this_node->details->online = TRUE;
1703     }
1704 
1705     /* consider this node shutting down if transitioning start->stop */
1706     if ((rsc->role == pcmk_role_started)
1707         && (rsc->next_role == pcmk_role_stopped)) {
1708 
1709         crm_trace("%s node %s shutting down because connection resource is stopping",
1710                   (container? "Guest" : "Remote"), this_node->details->id);
1711         this_node->details->shutdown = TRUE;
1712     }
1713 
1714     /* Now check all the failure conditions. */
1715     if (container && pcmk_is_set(container->flags, pcmk_rsc_failed)) {
1716         crm_trace("Guest node %s UNCLEAN because guest resource failed",
1717                   this_node->details->id);
1718         this_node->details->online = FALSE;
1719         this_node->details->remote_requires_reset = TRUE;
1720 
1721     } else if (pcmk_is_set(rsc->flags, pcmk_rsc_failed)) {
1722         crm_trace("%s node %s OFFLINE because connection resource failed",
1723                   (container? "Guest" : "Remote"), this_node->details->id);
1724         this_node->details->online = FALSE;
1725 
1726     } else if ((rsc->role == pcmk_role_stopped)
1727                || ((container != NULL)
1728                    && (container->role == pcmk_role_stopped))) {
1729 
1730         crm_trace("%s node %s OFFLINE because its resource is stopped",
1731                   (container? "Guest" : "Remote"), this_node->details->id);
1732         this_node->details->online = FALSE;
1733         this_node->details->remote_requires_reset = FALSE;
1734 
1735     } else if (host && (host->details->online == FALSE)
1736                && host->details->unclean) {
1737         crm_trace("Guest node %s UNCLEAN because host is unclean",
1738                   this_node->details->id);
1739         this_node->details->online = FALSE;
1740         this_node->details->remote_requires_reset = TRUE;
1741     }
1742 
1743 remote_online_done:
1744     crm_trace("Remote node %s online=%s", this_node->details->id,
1745               this_node->details->online ? "TRUE" : "FALSE");
1746 }
1747 
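     /*!
      * \internal
      * \brief Determine a node's online status and update its details accordingly
      *
      * \param[in]     node_state  Node's node_state entry in the CIB status
      * \param[in,out] this_node   Node whose status is being determined
      * \param[in,out] scheduler   Scheduler data
      */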
1748 static void
1749 determine_online_status(const xmlNode *node_state, pcmk_node_t *this_node,
1750                         pcmk_scheduler_t *scheduler)
1751 {
1752     gboolean online = FALSE;
1753     const char *exp_state = crm_element_value(node_state, PCMK__XA_EXPECTED);
1754 
1755     CRM_CHECK(this_node != NULL, return);
1756 
1757     this_node->details->shutdown = FALSE;
1758     this_node->details->expected_up = FALSE;
1759 
1760     if (pe__shutdown_requested(this_node)) {
1761         this_node->details->shutdown = TRUE;
1762 
1763     } else if (pcmk__str_eq(exp_state, CRMD_JOINSTATE_MEMBER, pcmk__str_casei)) {
1764         this_node->details->expected_up = TRUE;
1765     }
1766 
1767     if (this_node->details->type == node_ping) {
1768         this_node->details->unclean = FALSE;
1769         online = FALSE;         /* As far as resource management is concerned,
1770                                  * the node is safely offline.
1771                                  * Anyone caught abusing this logic will be shot
1772                                  */
1773 
1774     } else if (!pcmk_is_set(scheduler->flags, pcmk_sched_fencing_enabled)) {
1775         online = determine_online_status_no_fencing(scheduler, node_state,
1776                                                     this_node);
1777 
1778     } else {
1779         online = determine_online_status_fencing(scheduler, node_state,
1780                                                  this_node);
1781     }
1782 
1783     if (online) {
1784         this_node->details->online = TRUE;
1785 
1786     } else {
1787         /* remove node from contention */
1788         this_node->fixed = TRUE; // @COMPAT deprecated and unused
1789         this_node->weight = -INFINITY;
1790     }
1791 
1792     if (online && this_node->details->shutdown) {
1793         /* don't run resources here */
1794         this_node->fixed = TRUE; // @COMPAT deprecated and unused
1795         this_node->weight = -INFINITY;
1796     }
1797 
1798     if (this_node->details->type == node_ping) {
1799         crm_info("%s is not a Pacemaker node", pe__node_name(this_node));
1800 
1801     } else if (this_node->details->unclean) {
1802         pe_proc_warn("%s is unclean", pe__node_name(this_node));
1803 
1804     } else if (this_node->details->online) {
1805         crm_info("%s is %s", pe__node_name(this_node),
1806                  this_node->details->shutdown ? "shutting down" :
1807                  this_node->details->pending ? "pending" :
1808                  this_node->details->standby ? "standby" :
1809                  this_node->details->maintenance ? "maintenance" : "online");
1810 
1811     } else {
1812         crm_trace("%s is offline", pe__node_name(this_node));
1813     }
1814 }
1815 
1816 /*!
1817  * \internal
1818  * \brief Find the end of a resource's name, excluding any clone suffix
1819  *
1820  * \param[in] id  Resource ID to check
1821  *
1822  * \return Pointer to last character of resource's base name, or NULL if
      *         \p id is NULL or empty
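      * \note For example, given "myclone:1", this returns a pointer to the final
      *       'e' of "myclone" within the passed string.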
1823  */
1824 const char *
1825 pe_base_name_end(const char *id)
1826 {
1827     if (!pcmk__str_empty(id)) {
1828         const char *end = id + strlen(id) - 1;
1829 
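             /* Scan backward over any trailing digits; if they are preceded by
              * ':', they form a clone instance suffix, and the base name ends
              * just before the ':'.
              */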
1830         for (const char *s = end; s > id; --s) {
1831             switch (*s) {
1832                 case '0':
1833                 case '1':
1834                 case '2':
1835                 case '3':
1836                 case '4':
1837                 case '5':
1838                 case '6':
1839                 case '7':
1840                 case '8':
1841                 case '9':
1842                     break;
1843                 case ':':
1844                     return (s == end)? s : (s - 1);
1845                 default:
1846                     return end;
1847             }
1848         }
1849         return end;
1850     }
1851     return NULL;
1852 }
1853 
1854 /*!
1855  * \internal
1856  * \brief Get a resource name excluding any clone suffix
1857  *
1858  * \param[in] last_rsc_id  Resource ID to check
1859  *
1860  * \return Pointer to newly allocated string with resource's base name
1861  * \note It is the caller's responsibility to free() the result.
1862  *       This asserts on error, so callers can assume result is not NULL.
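      * \note For example, clone_strip("myclone:2") returns "myclone".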
1863  */
1864 char *
1865 clone_strip(const char *last_rsc_id)
1866 {
1867     const char *end = pe_base_name_end(last_rsc_id);
1868     char *basename = NULL;
1869 
1870     CRM_ASSERT(end);
1871     basename = strndup(last_rsc_id, end - last_rsc_id + 1);
1872     CRM_ASSERT(basename);
1873     return basename;
1874 }
1875 
1876 /*!
1877  * \internal
1878  * \brief Get the name of the first instance of a cloned resource
1879  *
1880  * \param[in] last_rsc_id  Resource ID to check
1881  *
1882  * \return Pointer to newly allocated string with resource's base name plus :0
1883  * \note It is the caller's responsibility to free() the result.
1884  *       This asserts on error, so callers can assume result is not NULL.
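      * \note For example, clone_zero("myclone:2") returns "myclone:0".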
1885  */
1886 char *
1887 clone_zero(const char *last_rsc_id)
1888 {
1889     const char *end = pe_base_name_end(last_rsc_id);
1890     size_t base_name_len = end - last_rsc_id + 1;
1891     char *zero = NULL;
1892 
1893     CRM_ASSERT(end);
1894     zero = calloc(base_name_len + 3, sizeof(char));
1895     CRM_ASSERT(zero);
1896     memcpy(zero, last_rsc_id, base_name_len);
1897     zero[base_name_len] = ':';
1898     zero[base_name_len + 1] = '0';
1899     return zero;
1900 }
1901 
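     /*!
      * \internal
      * \brief Create a resource to represent an orphaned history entry
      *
      * \param[in]     rsc_id     ID of the orphaned resource
      * \param[in]     rsc_entry  Resource history entry XML
      * \param[in,out] scheduler  Scheduler data
      *
      * \return Newly created orphan resource (added to the scheduler's resource
      *         list), or NULL if the entry could not be unpacked
      */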
1902 static pcmk_resource_t *
1903 create_fake_resource(const char *rsc_id, const xmlNode *rsc_entry,
1904                      pcmk_scheduler_t *scheduler)
1905 {
1906     pcmk_resource_t *rsc = NULL;
1907     xmlNode *xml_rsc = create_xml_node(NULL, XML_CIB_TAG_RESOURCE);
1908 
1909     copy_in_properties(xml_rsc, rsc_entry);
1910     crm_xml_add(xml_rsc, XML_ATTR_ID, rsc_id);
1911     crm_log_xml_debug(xml_rsc, "Orphan resource");
1912 
1913     if (pe__unpack_resource(xml_rsc, &rsc, NULL, scheduler) != pcmk_rc_ok) {
1914         return NULL;
1915     }
1916 
1917     if (xml_contains_remote_node(xml_rsc)) {
1918         pcmk_node_t *node;
1919 
1920         crm_debug("Detected orphaned remote node %s", rsc_id);
1921         node = pe_find_node(scheduler->nodes, rsc_id);
1922         if (node == NULL) {
1923             node = pe_create_node(rsc_id, rsc_id, "remote", NULL, scheduler);
1924         }
1925         link_rsc2remotenode(scheduler, rsc);
1926 
1927         if (node) {
1928             crm_trace("Setting node %s as shutting down due to orphaned connection resource", rsc_id);
1929             node->details->shutdown = TRUE;
1930         }
1931     }
1932 
1933     if (crm_element_value(rsc_entry, XML_RSC_ATTR_CONTAINER)) {
1934         /* This orphaned rsc needs to be mapped to a container. */
1935         crm_trace("Detected orphaned container filler %s", rsc_id);
1936         pe__set_resource_flags(rsc, pcmk_rsc_removed_filler);
1937     }
1938     pe__set_resource_flags(rsc, pcmk_rsc_removed);
1939     scheduler->resources = g_list_append(scheduler->resources, rsc);
1940     return rsc;
1941 }
1942 
1943 /*!
1944  * \internal
1945  * \brief Create orphan instance for anonymous clone resource history
1946  *
1947  * \param[in,out] parent     Clone resource that orphan will be added to
1948  * \param[in]     rsc_id     Orphan's resource ID
1949  * \param[in]     node       Where orphan is active (for logging only)
1950  * \param[in,out] scheduler  Scheduler data
1951  *
1952  * \return Newly added orphaned instance of \p parent
1953  */
1954 static pcmk_resource_t *
1955 create_anonymous_orphan(pcmk_resource_t *parent, const char *rsc_id,
1956                         const pcmk_node_t *node, pcmk_scheduler_t *scheduler)
1957 {
1958     pcmk_resource_t *top = pe__create_clone_child(parent, scheduler);
1959 
1960     // find_rsc() because we might be a cloned group
1961     pcmk_resource_t *orphan = top->fns->find_rsc(top, rsc_id, NULL,
1962                                                pcmk_rsc_match_clone_only);
1963 
1964     pe_rsc_debug(parent, "Created orphan %s for %s: %s on %s",
1965                  top->id, parent->id, rsc_id, pe__node_name(node));
1966     return orphan;
1967 }
1968 
1969 /*!
1970  * \internal
1971  * \brief Check a node for an instance of an anonymous clone
1972  *
1973  * Return a child instance of the specified anonymous clone, in order of
1974  * preference: (1) the instance running on the specified node, if any;
1975  * (2) an inactive instance (i.e. within the total of clone-max instances);
1976  * (3) a newly created orphan (i.e. clone-max instances are already active).
1977  *
1978  * \param[in,out] scheduler  Scheduler data
1979  * \param[in]     node       Node on which to check for instance
1980  * \param[in,out] parent     Clone to check
1981  * \param[in]     rsc_id     Name of cloned resource in history (no instance)
      *
      * \return Instance of \p parent matching \p rsc_id on \p node (a newly
      *         created orphan instance if no suitable instance is available)
1982  */
1983 static pcmk_resource_t *
1984 find_anonymous_clone(pcmk_scheduler_t *scheduler, const pcmk_node_t *node,
1985                      pcmk_resource_t *parent, const char *rsc_id)
1986 {
1987     GList *rIter = NULL;
1988     pcmk_resource_t *rsc = NULL;
1989     pcmk_resource_t *inactive_instance = NULL;
1990     gboolean skip_inactive = FALSE;
1991 
1992     CRM_ASSERT(parent != NULL);
1993     CRM_ASSERT(pe_rsc_is_clone(parent));
1994     CRM_ASSERT(!pcmk_is_set(parent->flags, pcmk_rsc_unique));
1995 
1996     // Check for active (or partially active, for cloned groups) instance
1997     pe_rsc_trace(parent, "Looking for %s on %s in %s",
1998                  rsc_id, pe__node_name(node), parent->id);
1999     for (rIter = parent->children; rsc == NULL && rIter; rIter = rIter->next) {
2000         GList *locations = NULL;
2001         pcmk_resource_t *child = rIter->data;
2002 
2003         /* Check whether this instance is already known to be active or pending
2004          * anywhere, at this stage of unpacking. Because this function is called
2005          * for a resource before the resource's individual operation history
2006          * entries are unpacked, locations will generally not contain the
2007          * desired node.
2008          *
2009          * However, there are three exceptions:
2010          * (1) when child is a cloned group and we have already unpacked the
2011          *     history of another member of the group on the same node;
2012          * (2) when we've already unpacked the history of another numbered
2013          *     instance on the same node (which can happen if globally-unique
2014          *     was flipped from true to false); and
2015          * (3) when we re-run calculations on the same scheduler data as part of
2016          *     a simulation.
2017          */
2018         child->fns->location(child, &locations, 2);
2019         if (locations) {
2020             /* We should never associate the same numbered anonymous clone
2021              * instance with multiple nodes, and clone instances can't migrate,
2022              * so there must be only one location, regardless of history.
2023              */
2024             CRM_LOG_ASSERT(locations->next == NULL);
2025 
2026             if (((pcmk_node_t *) locations->data)->details == node->details) {
2027                 /* This child instance is active on the requested node, so check
2028                  * for a corresponding configured resource. We use find_rsc()
2029                  * instead of child because child may be a cloned group, and we
2030                  * need the particular member corresponding to rsc_id.
2031                  *
2032                  * If the history entry is orphaned, rsc will be NULL.
2033                  */
2034                 rsc = parent->fns->find_rsc(child, rsc_id, NULL,
2035                                             pcmk_rsc_match_clone_only);
2036                 if (rsc) {
2037                     /* If there are multiple instance history entries for an
2038                      * anonymous clone in a single node's history (which can
2039                      * happen if globally-unique is switched from true to
2040                      * false), we want to consider the instances beyond the
2041                      * first as orphans, even if there are inactive instance
2042                      * numbers available.
2043                      */
2044                     if (rsc->running_on) {
2045                         crm_notice("Active (now-)anonymous clone %s has "
2046                                    "multiple (orphan) instance histories on %s",
2047                                    parent->id, pe__node_name(node));
2048                         skip_inactive = TRUE;
2049                         rsc = NULL;
2050                     } else {
2051                         pe_rsc_trace(parent, "Resource %s, active", rsc->id);
2052                     }
2053                 }
2054             }
2055             g_list_free(locations);
2056 
2057         } else {
2058             pe_rsc_trace(parent, "Resource %s, skip inactive", child->id);
2059             if (!skip_inactive && !inactive_instance
2060                 && !pcmk_is_set(child->flags, pcmk_rsc_blocked)) {
2061                 // Remember one inactive instance in case we don't find active
2062                 inactive_instance = parent->fns->find_rsc(child, rsc_id, NULL,
2063                                                           pcmk_rsc_match_clone_only);
2064 
2065                 /* ... but don't use it if it was already associated with a
2066                  * pending action on another node
2067                  */
2068                 if (inactive_instance && inactive_instance->pending_node
2069                     && (inactive_instance->pending_node->details != node->details)) {
2070                     inactive_instance = NULL;
2071                 }
2072             }
2073         }
2074     }
2075 
2076     if ((rsc == NULL) && !skip_inactive && (inactive_instance != NULL)) {
2077         pe_rsc_trace(parent, "Resource %s, empty slot", inactive_instance->id);
2078         rsc = inactive_instance;
2079     }
2080 
2081     /* If the resource has "requires" set to "quorum" or "nothing", and we don't
2082      * have a clone instance for every node, we don't want to consume a valid
2083      * instance number for unclean nodes. Such instances may appear to be active
2084      * according to the history, but should be considered inactive, so we can
2085      * start an instance elsewhere. Treat such instances as orphans.
2086      *
2087      * An exception is instances running on guest nodes -- since guest node
2088      * "fencing" is actually just a resource stop, requires shouldn't apply.
2089      *
2090      * @TODO Ideally, we'd use an inactive instance number if it is not needed
2091      * for any clean instances. However, we don't know that at this point.
2092      */
2093     if ((rsc != NULL) && !pcmk_is_set(rsc->flags, pcmk_rsc_needs_fencing)
2094         && (!node->details->online || node->details->unclean)
2095         && !pe__is_guest_node(node)
2096         && !pe__is_universal_clone(parent, scheduler)) {
2097 
2098         rsc = NULL;
2099     }
2100 
2101     if (rsc == NULL) {
2102         rsc = create_anonymous_orphan(parent, rsc_id, node, scheduler);
2103         pe_rsc_trace(parent, "Resource %s, orphan", rsc->id);
2104     }
2105     return rsc;
2106 }
2107 
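     /*!
      * \internal
      * \brief Find the resource that a history entry should be applied to
      *
      * \param[in,out] scheduler  Scheduler data
      * \param[in]     node       Node that the history entry is on
      * \param[in]     rsc_id     Resource ID from the history entry
      *
      * \return Matching resource (possibly an anonymous clone instance or bundle
      *         replica), or NULL if none was found (i.e. the entry is orphaned)
      */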
2108 static pcmk_resource_t *
2109 unpack_find_resource(pcmk_scheduler_t *scheduler, const pcmk_node_t *node,
2110                      const char *rsc_id)
2111 {
2112     pcmk_resource_t *rsc = NULL;
2113     pcmk_resource_t *parent = NULL;
2114 
2115     crm_trace("looking for %s", rsc_id);
2116     rsc = pe_find_resource(scheduler->resources, rsc_id);
2117 
2118     if (rsc == NULL) {
2119         /* If we didn't find the resource by its name in the operation history,
2120          * check it again as a clone instance. Even when clone-max=0, we create
2121          * a single :0 orphan to match against here.
2122          */
2123         char *clone0_id = clone_zero(rsc_id);
2124         pcmk_resource_t *clone0 = pe_find_resource(scheduler->resources,
2125                                                    clone0_id);
2126 
2127         if (clone0 && !pcmk_is_set(clone0->flags, pcmk_rsc_unique)) {
2128             rsc = clone0;
2129             parent = uber_parent(clone0);
2130             crm_trace("%s found as %s (%s)", rsc_id, clone0_id, parent->id);
2131         } else {
2132             crm_trace("%s is not known as %s either (orphan)",
2133                       rsc_id, clone0_id);
2134         }
2135         free(clone0_id);
2136 
2137     } else if (rsc->variant > pcmk_rsc_variant_primitive) {
2138         crm_trace("Resource history for %s is orphaned because it is no longer primitive",
2139                   rsc_id);
2140         return NULL;
2141 
2142     } else {
2143         parent = uber_parent(rsc);
2144     }
2145 
2146     if (pe_rsc_is_anon_clone(parent)) {
2147 
2148         if (pe_rsc_is_bundled(parent)) {
2149             rsc = pe__find_bundle_replica(parent->parent, node);
2150         } else {
2151             char *base = clone_strip(rsc_id);
2152 
2153             rsc = find_anonymous_clone(scheduler, node, parent, base);
2154             free(base);
2155             CRM_ASSERT(rsc != NULL);
2156         }
2157     }
2158 
2159     if (rsc && !pcmk__str_eq(rsc_id, rsc->id, pcmk__str_casei)
2160         && !pcmk__str_eq(rsc_id, rsc->clone_name, pcmk__str_casei)) {
2161 
2162         pcmk__str_update(&rsc->clone_name, rsc_id);
2163         pe_rsc_debug(rsc, "Internally renamed %s on %s to %s%s",
2164                      rsc_id, pe__node_name(node), rsc->id,
2165                      (pcmk_is_set(rsc->flags, pcmk_rsc_removed)? " (ORPHAN)" : ""));
2166     }
2167     return rsc;
2168 }
2169 
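     /*!
      * \internal
      * \brief Create an orphan resource for a history entry with no configuration
      *
      * \param[in]     rsc_entry  Resource history entry XML
      * \param[in]     node       Node that the history entry is on
      * \param[in,out] scheduler  Scheduler data
      *
      * \return Newly created orphan resource, or NULL on error
      */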
2170 static pcmk_resource_t *
2171 process_orphan_resource(const xmlNode *rsc_entry, const pcmk_node_t *node,
2172                         pcmk_scheduler_t *scheduler)
2173 {
2174     pcmk_resource_t *rsc = NULL;
2175     const char *rsc_id = crm_element_value(rsc_entry, XML_ATTR_ID);
2176 
2177     crm_debug("Detected orphan resource %s on %s", rsc_id, pe__node_name(node));
2178     rsc = create_fake_resource(rsc_id, rsc_entry, scheduler);
2179     if (rsc == NULL) {
2180         return NULL;
2181     }
2182 
2183     if (!pcmk_is_set(scheduler->flags, pcmk_sched_stop_removed_resources)) {
2184         pe__clear_resource_flags(rsc, pcmk_rsc_managed);
2185 
2186     } else {
2187         CRM_CHECK(rsc != NULL, return NULL);
2188         pe_rsc_trace(rsc, "Added orphan %s", rsc->id);
2189         resource_location(rsc, NULL, -INFINITY, "__orphan_do_not_run__",
2190                           scheduler);
2191     }
2192     return rsc;
2193 }
2194 
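     /*!
      * \internal
      * \brief Update scheduler data with a resource's known state on a node
      *
      * This records where the resource is known to be active, schedules any
      * fencing or recovery implied by that state, and applies the given failure
      * handling.
      *
      * \param[in,out] rsc      Resource whose state is being processed
      * \param[in,out] node     Node that the state was found on
      * \param[in]     on_fail  Failure handling indicated by the resource's history
      */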
2195 static void
2196 process_rsc_state(pcmk_resource_t *rsc, pcmk_node_t *node,
2197                   enum action_fail_response on_fail)
2198 {
2199     pcmk_node_t *tmpnode = NULL;
2200     char *reason = NULL;
2201     enum action_fail_response save_on_fail = pcmk_on_fail_ignore;
2202 
2203     CRM_ASSERT(rsc);
2204     pe_rsc_trace(rsc, "Resource %s is %s on %s: on_fail=%s",
2205                  rsc->id, role2text(rsc->role), pe__node_name(node),
2206                  fail2text(on_fail));
2207 
2208     /* process current state */
2209     if (rsc->role != pcmk_role_unknown) {
2210         pcmk_resource_t *iter = rsc;
2211 
2212         while (iter) {
2213             if (g_hash_table_lookup(iter->known_on, node->details->id) == NULL) {
2214                 pcmk_node_t *n = pe__copy_node(node);
2215 
2216                 pe_rsc_trace(rsc, "%s%s%s known on %s",
2217                              rsc->id,
2218                              ((rsc->clone_name == NULL)? "" : " also known as "),
2219                              ((rsc->clone_name == NULL)? "" : rsc->clone_name),
2220                              pe__node_name(n));
2221                 g_hash_table_insert(iter->known_on, (gpointer) n->details->id, n);
2222             }
2223             if (pcmk_is_set(iter->flags, pcmk_rsc_unique)) {
2224                 break;
2225             }
2226             iter = iter->parent;
2227         }
2228     }
2229 
2230     /* If a managed resource is believed to be running, but node is down ... */
2231     if ((rsc->role > pcmk_role_stopped)
2232         && node->details->online == FALSE
2233         && node->details->maintenance == FALSE
2234         && pcmk_is_set(rsc->flags, pcmk_rsc_managed)) {
2235 
2236         gboolean should_fence = FALSE;
2237 
2238         /* If this is a guest node, fence it (regardless of whether fencing is
2239          * enabled, because guest node fencing is done by recovery of the
2240          * container resource rather than by the fencer). Mark the resource
2241          * we're processing as failed. When the guest comes back up, its
2242          * operation history in the CIB will be cleared, freeing the affected
2243          * resource to run again once we are sure we know its state.
2244          */
2245         if (pe__is_guest_node(node)) {
2246             pe__set_resource_flags(rsc,
2247                                    pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
2248             should_fence = TRUE;
2249 
2250         } else if (pcmk_is_set(rsc->cluster->flags,
2251                                pcmk_sched_fencing_enabled)) {
2252             if (pe__is_remote_node(node) && node->details->remote_rsc
2253                 && !pcmk_is_set(node->details->remote_rsc->flags,
2254                                 pcmk_rsc_failed)) {
2255 
2256                 /* Setting unseen means that fencing of the remote node will
2257                  * occur only if the connection resource is not going to start
2258                  * somewhere. This allows connection resources on a failed
2259                  * cluster node to move to another node without requiring the
2260                  * remote nodes to be fenced as well.
2261                  */
2262                 node->details->unseen = TRUE;
2263                 reason = crm_strdup_printf("%s is active there (fencing will be"
2264                                            " revoked if remote connection can "
2265                                            "be re-established elsewhere)",
2266                                            rsc->id);
2267             }
2268             should_fence = TRUE;
2269         }
2270 
2271         if (should_fence) {
2272             if (reason == NULL) {
2273                reason = crm_strdup_printf("%s is thought to be active there", rsc->id);
2274             }
2275             pe_fence_node(rsc->cluster, node, reason, FALSE);
2276         }
2277         free(reason);
2278     }
2279 
2280     /* Save the failure handling now, because on_fail may be reset below, and
          * the original value must be passed to native_add_running() so that
          * priority_fencing_delay can be calculated correctly.
          */
2281     save_on_fail = on_fail;
2282 
2283     if (node->details->unclean) {
2284         /* No extra processing needed
2285          * Also allows resources to be started again after a node is shot
2286          */
2287         on_fail = pcmk_on_fail_ignore;
2288     }
2289 
2290     switch (on_fail) {
2291         case pcmk_on_fail_ignore:
2292             /* nothing to do */
2293             break;
2294 
2295         case pcmk_on_fail_demote:
2296             pe__set_resource_flags(rsc, pcmk_rsc_failed);
2297             demote_action(rsc, node, FALSE);
2298             break;
2299 
2300         case pcmk_on_fail_fence_node:
2301             /* treat it as if it is still running
2302              * but also mark the node as unclean
2303              */
2304             reason = crm_strdup_printf("%s failed there", rsc->id);
2305             pe_fence_node(rsc->cluster, node, reason, FALSE);
2306             free(reason);
2307             break;
2308 
2309         case pcmk_on_fail_standby_node:
2310             node->details->standby = TRUE;
2311             node->details->standby_onfail = TRUE;
2312             break;
2313 
2314         case pcmk_on_fail_block:
2315             /* is_managed == FALSE will prevent any
2316              * actions from being sent for the resource
2317              */
2318             pe__clear_resource_flags(rsc, pcmk_rsc_managed);
2319             pe__set_resource_flags(rsc, pcmk_rsc_blocked);
2320             break;
2321 
2322         case pcmk_on_fail_ban:
2323             /* make sure it comes up somewhere else
2324              * or not at all
2325              */
2326             resource_location(rsc, node, -INFINITY, "__action_migration_auto__",
2327                               rsc->cluster);
2328             break;
2329 
2330         case pcmk_on_fail_stop:
2331             pe__set_next_role(rsc, pcmk_role_stopped, "on-fail=stop");
2332             break;
2333 
2334         case pcmk_on_fail_restart:
2335             if ((rsc->role != pcmk_role_stopped)
2336                 && (rsc->role != pcmk_role_unknown)) {
2337                 pe__set_resource_flags(rsc,
2338                                        pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
2339                 stop_action(rsc, node, FALSE);
2340             }
2341             break;
2342 
2343         case pcmk_on_fail_restart_container:
2344             pe__set_resource_flags(rsc,
2345                                    pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
2346             if (rsc->container && pe_rsc_is_bundled(rsc)) {
2347                 /* A bundle's remote connection can run on a different node than
2348                  * the bundle's container. We don't necessarily know where the
2349                  * container is running yet, so remember it and add a stop
2350                  * action for it later.
2351                  */
2352                 rsc->cluster->stop_needed =
2353                     g_list_prepend(rsc->cluster->stop_needed, rsc->container);
2354             } else if (rsc->container) {
2355                 stop_action(rsc->container, node, FALSE);
2356             } else if ((rsc->role != pcmk_role_stopped)
2357                        && (rsc->role != pcmk_role_unknown)) {
2358                 stop_action(rsc, node, FALSE);
2359             }
2360             break;
2361 
2362         case pcmk_on_fail_reset_remote:
2363             pe__set_resource_flags(rsc,
2364                                    pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
2365             if (pcmk_is_set(rsc->cluster->flags, pcmk_sched_fencing_enabled)) {
2366                 tmpnode = NULL;
2367                 if (rsc->is_remote_node) {
2368                     tmpnode = pe_find_node(rsc->cluster->nodes, rsc->id);
2369                 }
2370                 if (tmpnode &&
2371                     pe__is_remote_node(tmpnode) &&
2372                     tmpnode->details->remote_was_fenced == 0) {
2373 
2374                     /* The remote connection resource failed in a way that
2375                      * should result in fencing the remote node.
2376                      */
2377                     pe_fence_node(rsc->cluster, tmpnode,
2378                                   "remote connection is unrecoverable", FALSE);
2379                 }
2380             }
2381 
2382             /* Require the stop action regardless of whether fencing is occurring. */
2383             if (rsc->role > pcmk_role_stopped) {
2384                 stop_action(rsc, node, FALSE);
2385             }
2386 
2387             /* if reconnect delay is in use, prevent the connection from exiting the
2388              * "STOPPED" role until the failure is cleared by the delay timeout. */
2389             if (rsc->remote_reconnect_ms) {
2390                 pe__set_next_role(rsc, pcmk_role_stopped, "remote reset");
2391             }
2392             break;
2393     }
2394 
2395     /* Ensure a remote-node connection failure forces an unclean remote-node
2396      * to be fenced. By setting unseen = FALSE, the remote-node failure will
2397      * result in a fencing operation regardless of whether we attempt to
2398      * reconnect to the remote-node in this transition. */
2399     if (pcmk_is_set(rsc->flags, pcmk_rsc_failed) && rsc->is_remote_node) {
2400         tmpnode = pe_find_node(rsc->cluster->nodes, rsc->id);
2401         if (tmpnode && tmpnode->details->unclean) {
2402             tmpnode->details->unseen = FALSE;
2403         }
2404     }
2405 
2406     if ((rsc->role != pcmk_role_stopped)
2407         && (rsc->role != pcmk_role_unknown)) {
2408         if (pcmk_is_set(rsc->flags, pcmk_rsc_removed)) {
2409             if (pcmk_is_set(rsc->flags, pcmk_rsc_managed)) {
2410                 pcmk__config_warn("Detected active orphan %s running on %s",
2411                                   rsc->id, pe__node_name(node));
2412             } else {
2413                 pcmk__config_warn("Resource '%s' must be stopped manually on "
2414                                   "%s because cluster is configured not to "
2415                                   "stop active orphans",
2416                                   rsc->id, pe__node_name(node));
2417             }
2418         }
2419 
2420         native_add_running(rsc, node, rsc->cluster,
2421                            (save_on_fail != pcmk_on_fail_ignore));
2422         switch (on_fail) {
2423             case pcmk_on_fail_ignore:
2424                 break;
2425             case pcmk_on_fail_demote:
2426             case pcmk_on_fail_block:
2427                 pe__set_resource_flags(rsc, pcmk_rsc_failed);
2428                 break;
2429             default:
2430                 pe__set_resource_flags(rsc,
2431                                        pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
2432                 break;
2433         }
2434 
2435     } else if (rsc->clone_name && strchr(rsc->clone_name, ':') != NULL) {
2436         /* Only do this for older status sections that included instance numbers
2437          * Otherwise stopped instances will appear as orphans
2438          */
2439         pe_rsc_trace(rsc, "Resetting clone_name %s for %s (stopped)", rsc->clone_name, rsc->id);
2440         free(rsc->clone_name);
2441         rsc->clone_name = NULL;
2442 
2443     } else {
2444         GList *possible_matches = pe__resource_actions(rsc, node,
2445                                                        PCMK_ACTION_STOP, FALSE);
2446         GList *gIter = possible_matches;
2447 
2448         for (; gIter != NULL; gIter = gIter->next) {
2449             pcmk_action_t *stop = (pcmk_action_t *) gIter->data;
2450 
2451             pe__set_action_flags(stop, pcmk_action_optional);
2452         }
2453 
2454         g_list_free(possible_matches);
2455     }
2456 
2457     /* A successful stop after migrate_to on the migration source doesn't make
2458      * the partially migrated resource stopped on the migration target.
2459      */
2460     if ((rsc->role == pcmk_role_stopped)
2461         && rsc->partial_migration_source
2462         && rsc->partial_migration_source->details == node->details
2463         && rsc->partial_migration_target
2464         && rsc->running_on) {
2465 
2466         rsc->role = pcmk_role_started;
2467     }
2468 }
2469 
2470 /*!
      * \internal
      * \brief Create the resource's active recurring operations as optional actions
      *
      * \param[in,out] node            Node that the operation history is for
      * \param[in,out] rsc             Resource that the operation history is for
      * \param[in]     start_index     Index of the resource's last start in
      *                                \p sorted_op_list
      * \param[in]     stop_index      Index of the resource's last stop in
      *                                \p sorted_op_list
      * \param[in]     sorted_op_list  Sorted list of the resource's history entries
      * \param[in,out] scheduler       Scheduler data
      */
2471 static void
2472 process_recurring(pcmk_node_t *node, pcmk_resource_t *rsc,
2473                   int start_index, int stop_index,
2474                   GList *sorted_op_list, pcmk_scheduler_t *scheduler)
2475 {
2476     int counter = -1;
2477     const char *task = NULL;
2478     const char *status = NULL;
2479     GList *gIter = sorted_op_list;
2480 
2481     CRM_ASSERT(rsc);
2482     pe_rsc_trace(rsc, "%s: Start index %d, stop index = %d", rsc->id, start_index, stop_index);
2483 
2484     for (; gIter != NULL; gIter = gIter->next) {
2485         xmlNode *rsc_op = (xmlNode *) gIter->data;
2486 
2487         guint interval_ms = 0;
2488         char *key = NULL;
2489         const char *id = ID(rsc_op);
2490 
2491         counter++;
2492 
2493         if (node->details->online == FALSE) {
2494             pe_rsc_trace(rsc, "Skipping %s on %s: node is offline",
2495                          rsc->id, pe__node_name(node));
2496             break;
2497 
2498             /* Need to check if there's a monitor for role="Stopped" */
2499         } else if (start_index < stop_index && counter <= stop_index) {
2500             pe_rsc_trace(rsc, "Skipping %s on %s: resource is not active",
2501                          id, pe__node_name(node));
2502             continue;
2503 
2504         } else if (counter < start_index) {
2505             pe_rsc_trace(rsc, "Skipping %s on %s: old %d",
2506                          id, pe__node_name(node), counter);
2507             continue;
2508         }
2509 
2510         crm_element_value_ms(rsc_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms);
2511         if (interval_ms == 0) {
2512             pe_rsc_trace(rsc, "Skipping %s on %s: non-recurring",
2513                          id, pe__node_name(node));
2514             continue;
2515         }
2516 
2517         status = crm_element_value(rsc_op, XML_LRM_ATTR_OPSTATUS);
2518         if (pcmk__str_eq(status, "-1", pcmk__str_casei)) {
2519             pe_rsc_trace(rsc, "Skipping %s on %s: status",
2520                          id, pe__node_name(node));
2521             continue;
2522         }
2523         task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK);
2524         /* create the action */
2525         key = pcmk__op_key(rsc->id, task, interval_ms);
2526         pe_rsc_trace(rsc, "Creating %s on %s", key, pe__node_name(node));
2527         custom_action(rsc, key, task, node, TRUE, scheduler);
2528     }
2529 }
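
/* Illustrative sketch, not part of the actual scheduler code:
 * process_recurring() keys each recurring action by resource, task, and
 * interval. For a hypothetical resource "db" with a 10-second monitor, the
 * key built by pcmk__op_key() would look something like "db_monitor_10000".
 * The PE_UNPACK_EXAMPLES guard is hypothetical, so this sketch is compiled
 * out by default.
 */
#ifdef PE_UNPACK_EXAMPLES
static void
example_recurring_op_key(void)
{
    guint interval_ms = 10000;  // hypothetical 10s monitor interval
    char *key = pcmk__op_key("db", PCMK_ACTION_MONITOR, interval_ms);

    crm_trace("Recurring operation key: %s", key);
    free(key);
}
#endif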
2530 
2531 void
2532 calculate_active_ops(const GList *sorted_op_list, int *start_index,
2533                      int *stop_index)
2534 {
2535     int counter = -1;
2536     int implied_monitor_start = -1;
2537     int implied_clone_start = -1;
2538     const char *task = NULL;
2539     const char *status = NULL;
2540 
2541     *stop_index = -1;
2542     *start_index = -1;
2543 
2544     for (const GList *iter = sorted_op_list; iter != NULL; iter = iter->next) {
2545         const xmlNode *rsc_op = (const xmlNode *) iter->data;
2546 
2547         counter++;
2548 
2549         task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK);
2550         status = crm_element_value(rsc_op, XML_LRM_ATTR_OPSTATUS);
2551 
2552         if (pcmk__str_eq(task, PCMK_ACTION_STOP, pcmk__str_casei)
2553             && pcmk__str_eq(status, "0", pcmk__str_casei)) {
2554             *stop_index = counter;
2555 
2556         } else if (pcmk__strcase_any_of(task, PCMK_ACTION_START,
2557                                         PCMK_ACTION_MIGRATE_FROM, NULL)) {
2558             *start_index = counter;
2559 
2560         } else if ((implied_monitor_start <= *stop_index)
2561                    && pcmk__str_eq(task, PCMK_ACTION_MONITOR,
2562                                    pcmk__str_casei)) {
2563             const char *rc = crm_element_value(rsc_op, XML_LRM_ATTR_RC);
2564 
2565             if (pcmk__strcase_any_of(rc, "0", "8", NULL)) {
2566                 implied_monitor_start = counter;
2567             }
2568         } else if (pcmk__strcase_any_of(task, PCMK_ACTION_PROMOTE,
2569                                         PCMK_ACTION_DEMOTE, NULL)) {
2570             implied_clone_start = counter;
2571         }
2572     }
2573 
2574     if (*start_index == -1) {
2575         if (implied_clone_start != -1) {
2576             *start_index = implied_clone_start;
2577         } else if (implied_monitor_start != -1) {
2578             *start_index = implied_monitor_start;
2579         }
2580     }
2581 }
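
/* Illustrative sketch, not part of the actual scheduler code: callers pair
 * calculate_active_ops() with process_recurring(), as unpack_lrm_resource()
 * does below. For a hypothetical sorted history of start, monitor,
 * (successful) stop, start, monitor, the result would be stop_index=2 and
 * start_index=3, so only history after the most recent start counts as
 * active. The PE_UNPACK_EXAMPLES guard is hypothetical, so this sketch is
 * compiled out by default.
 */
#ifdef PE_UNPACK_EXAMPLES
static void
example_active_window(GList *sorted_op_list)
{
    int start_index = -1;
    int stop_index = -1;

    calculate_active_ops(sorted_op_list, &start_index, &stop_index);
    crm_trace("Active history window: start_index=%d stop_index=%d",
              start_index, stop_index);
}
#endif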
2582 
2583 // If resource history entry has shutdown lock, remember lock node and time
2584 static void
2585 unpack_shutdown_lock(const xmlNode *rsc_entry, pcmk_resource_t *rsc,
2586                      const pcmk_node_t *node, pcmk_scheduler_t *scheduler)
2587 {
2588     time_t lock_time = 0;   // When lock started (i.e. node shutdown time)
2589 
2590     if ((crm_element_value_epoch(rsc_entry, XML_CONFIG_ATTR_SHUTDOWN_LOCK,
2591                                  &lock_time) == pcmk_ok) && (lock_time != 0)) {
2592 
2593         if ((scheduler->shutdown_lock > 0)
2594             && (get_effective_time(scheduler)
2595                 > (lock_time + scheduler->shutdown_lock))) {
2596             pe_rsc_info(rsc, "Shutdown lock for %s on %s expired",
2597                         rsc->id, pe__node_name(node));
2598             pe__clear_resource_history(rsc, node);
2599         } else {
2600             /* @COMPAT I don't like breaking const signatures, but
2601              * rsc->lock_node should really be const -- we just can't change it
2602              * until the next API compatibility break.
2603              */
2604             rsc->lock_node = (pcmk_node_t *) node;
2605             rsc->lock_time = lock_time;
2606         }
2607     }
2608 }
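
/* Illustrative sketch, not part of the actual scheduler code: the expiry test
 * in unpack_shutdown_lock() is plain epoch arithmetic -- a lock taken when
 * the node shut down expires once the configured shutdown-lock limit has
 * passed, and a limit of 0 means locks never expire. The PE_UNPACK_EXAMPLES
 * guard is hypothetical, so this sketch is compiled out by default.
 */
#ifdef PE_UNPACK_EXAMPLES
static bool
example_shutdown_lock_expired(time_t lock_time, time_t lock_limit, time_t now)
{
    // Mirrors the check above: a limit <= 0 means the lock never expires
    return (lock_limit > 0) && (now > (lock_time + lock_limit));
}
#endif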
2609 
2610 /*!
2611  * \internal
2612  * \brief Unpack one lrm_resource entry from a node's CIB status
2613  *
2614  * \param[in,out] node       Node whose status is being unpacked
2615  * \param[in]     lrm_resource  lrm_resource XML being unpacked
2616  * \param[in,out] scheduler  Scheduler data
2617  *
2618  * \return Resource corresponding to the entry, or NULL if no operation history
2619  */
2620 static pcmk_resource_t *
2621 unpack_lrm_resource(pcmk_node_t *node, const xmlNode *lrm_resource,
2622                     pcmk_scheduler_t *scheduler)
2623 {
2624     GList *gIter = NULL;
2625     int stop_index = -1;
2626     int start_index = -1;
2627     enum rsc_role_e req_role = pcmk_role_unknown;
2628 
2629     const char *rsc_id = ID(lrm_resource);
2630 
2631     pcmk_resource_t *rsc = NULL;
2632     GList *op_list = NULL;
2633     GList *sorted_op_list = NULL;
2634 
2635     xmlNode *rsc_op = NULL;
2636     xmlNode *last_failure = NULL;
2637 
2638     enum action_fail_response on_fail = pcmk_on_fail_ignore;
2639     enum rsc_role_e saved_role = pcmk_role_unknown;
2640 
2641     if (rsc_id == NULL) {
2642         crm_warn("Ignoring malformed " XML_LRM_TAG_RESOURCE
2643                  " entry without id");
2644         return NULL;
2645     }
2646     crm_trace("Unpacking " XML_LRM_TAG_RESOURCE " for %s on %s",
2647               rsc_id, pe__node_name(node));
2648 
2649     // Build a list of individual lrm_rsc_op entries, so we can sort them
2650     for (rsc_op = first_named_child(lrm_resource, XML_LRM_TAG_RSC_OP);
2651          rsc_op != NULL; rsc_op = crm_next_same_xml(rsc_op)) {
2652 
2653         op_list = g_list_prepend(op_list, rsc_op);
2654     }
2655 
2656     if (!pcmk_is_set(scheduler->flags, pcmk_sched_shutdown_lock)) {
2657         if (op_list == NULL) {
2658             // If there are no operations, there is nothing to do
2659             return NULL;
2660         }
2661     }
2662 
2663     /* find the resource */
2664     rsc = unpack_find_resource(scheduler, node, rsc_id);
2665     if (rsc == NULL) {
2666         if (op_list == NULL) {
2667             // If there are no operations, there is nothing to do
2668             return NULL;
2669         } else {
2670             rsc = process_orphan_resource(lrm_resource, node, scheduler);
2671         }
2672     }
2673     CRM_ASSERT(rsc != NULL);
2674 
2675     // Check whether the resource is "shutdown-locked" to this node
2676     if (pcmk_is_set(scheduler->flags, pcmk_sched_shutdown_lock)) {
2677         unpack_shutdown_lock(lrm_resource, rsc, node, scheduler);
2678     }
2679 
2680     /* process operations */
2681     saved_role = rsc->role;
2682     rsc->role = pcmk_role_unknown;
2683     sorted_op_list = g_list_sort(op_list, sort_op_by_callid);
2684 
2685     for (gIter = sorted_op_list; gIter != NULL; gIter = gIter->next) {
2686         xmlNode *rsc_op = (xmlNode *) gIter->data;
2687 
2688         unpack_rsc_op(rsc, node, rsc_op, &last_failure, &on_fail);
2689     }
2690 
2691     /* create active recurring operations as optional */
2692     calculate_active_ops(sorted_op_list, &start_index, &stop_index);
2693     process_recurring(node, rsc, start_index, stop_index, sorted_op_list,
2694                       scheduler);
2695 
2696     /* no need to free the contents */
2697     g_list_free(sorted_op_list);
2698 
2699     process_rsc_state(rsc, node, on_fail);
2700 
2701     if (get_target_role(rsc, &req_role)) {
2702         if ((rsc->next_role == pcmk_role_unknown)
2703             || (req_role < rsc->next_role)) {
2704 
2705             pe__set_next_role(rsc, req_role, XML_RSC_ATTR_TARGET_ROLE);
2706 
2707         } else if (req_role > rsc->next_role) {
2708             pe_rsc_info(rsc, "%s: Not overwriting calculated next role %s"
2709                         " with requested next role %s",
2710                         rsc->id, role2text(rsc->next_role), role2text(req_role));
2711         }
2712     }
2713 
2714     if (saved_role > rsc->role) {
2715         rsc->role = saved_role;
2716     }
2717 
2718     return rsc;
2719 }
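
/* Illustrative sketch, not part of the actual scheduler code: the target-role
 * handling at the end of unpack_lrm_resource() only ever lowers the
 * calculated next role (or fills it in when still unknown); a higher
 * requested role is logged and ignored. The PE_UNPACK_EXAMPLES guard is
 * hypothetical, so this sketch is compiled out by default.
 */
#ifdef PE_UNPACK_EXAMPLES
static enum rsc_role_e
example_clamp_next_role(enum rsc_role_e calculated, enum rsc_role_e requested)
{
    if ((calculated == pcmk_role_unknown) || (requested < calculated)) {
        return requested;   // requested role wins when unknown or lower
    }
    return calculated;      // otherwise keep the calculated next role
}
#endif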
2720 
2721 static void
2722 handle_orphaned_container_fillers(const xmlNode *lrm_rsc_list,
2723                                   pcmk_scheduler_t *scheduler)
2724 {
2725     for (const xmlNode *rsc_entry = pcmk__xe_first_child(lrm_rsc_list);
2726          rsc_entry != NULL; rsc_entry = pcmk__xe_next(rsc_entry)) {
2727 
2728         pcmk_resource_t *rsc;
2729         pcmk_resource_t *container;
2730         const char *rsc_id;
2731         const char *container_id;
2732 
2733         if (!pcmk__str_eq((const char *)rsc_entry->name, XML_LRM_TAG_RESOURCE, pcmk__str_casei)) {
2734             continue;
2735         }
2736 
2737         container_id = crm_element_value(rsc_entry, XML_RSC_ATTR_CONTAINER);
2738         rsc_id = crm_element_value(rsc_entry, XML_ATTR_ID);
2739         if (container_id == NULL || rsc_id == NULL) {
2740             continue;
2741         }
2742 
2743         container = pe_find_resource(scheduler->resources, container_id);
2744         if (container == NULL) {
2745             continue;
2746         }
2747 
2748         rsc = pe_find_resource(scheduler->resources, rsc_id);
2749         if ((rsc == NULL) || (rsc->container != NULL)
2750             || !pcmk_is_set(rsc->flags, pcmk_rsc_removed_filler)) {
2751             continue;
2752         }
2753 
2754         pe_rsc_trace(rsc, "Mapped container of orphaned resource %s to %s",
2755                      rsc->id, container_id);
2756         rsc->container = container;
2757         container->fillers = g_list_append(container->fillers, rsc);
2758     }
2759 }
2760 
2761 /*!
2762  * \internal
2763  * \brief Unpack one node's lrm status section
2764  *
2765  * \param[in,out] node       Node whose status is being unpacked
2766  * \param[in]     xml        CIB node state XML
2767  * \param[in,out] scheduler  Scheduler data
2768  */
2769 static void
2770 unpack_node_lrm(pcmk_node_t *node, const xmlNode *xml,
2771                 pcmk_scheduler_t *scheduler)
2772 {
2773     bool found_orphaned_container_filler = false;
2774 
2775     // Drill down to lrm_resources section
2776     xml = find_xml_node(xml, XML_CIB_TAG_LRM, FALSE);
2777     if (xml == NULL) {
2778         return;
2779     }
2780     xml = find_xml_node(xml, XML_LRM_TAG_RESOURCES, FALSE);
2781     if (xml == NULL) {
2782         return;
2783     }
2784 
2785     // Unpack each lrm_resource entry
2786     for (const xmlNode *rsc_entry = first_named_child(xml, XML_LRM_TAG_RESOURCE);
2787          rsc_entry != NULL; rsc_entry = crm_next_same_xml(rsc_entry)) {
2788 
2789         pcmk_resource_t *rsc = unpack_lrm_resource(node, rsc_entry, scheduler);
2790 
2791         if ((rsc != NULL)
2792             && pcmk_is_set(rsc->flags, pcmk_rsc_removed_filler)) {
2793             found_orphaned_container_filler = true;
2794         }
2795     }
2796 
2797     /* Now that all resource state has been unpacked for this node, map any
2798      * orphaned container fillers to their container resource.
2799      */
2800     if (found_orphaned_container_filler) {
2801         handle_orphaned_container_fillers(xml, scheduler);
2802     }
2803 }
2804 
2805 static void
2806 set_active(pcmk_resource_t *rsc)
2807 {
2808     const pcmk_resource_t *top = pe__const_top_resource(rsc, false);
2809 
2810     if (top && pcmk_is_set(top->flags, pcmk_rsc_promotable)) {
2811         rsc->role = pcmk_role_unpromoted;
2812     } else {
2813         rsc->role = pcmk_role_started;
2814     }
2815 }
2816 
2817 static void
2818 set_node_score(gpointer key, gpointer value, gpointer user_data)
2819 {
2820     pcmk_node_t *node = value;
2821     int *score = user_data;
2822 
2823     node->weight = *score;
2824 }
2825 
2826 #define XPATH_NODE_STATE "/" XML_TAG_CIB "/" XML_CIB_TAG_STATUS     \
2827                          "/" XML_CIB_TAG_STATE
2828 #define SUB_XPATH_LRM_RESOURCE "/" XML_CIB_TAG_LRM              \
2829                                "/" XML_LRM_TAG_RESOURCES        \
2830                                "/" XML_LRM_TAG_RESOURCE
2831 #define SUB_XPATH_LRM_RSC_OP "/" XML_LRM_TAG_RSC_OP
2832 
2833 static xmlNode *
2834 find_lrm_op(const char *resource, const char *op, const char *node, const char *source,
2835             int target_rc, pcmk_scheduler_t *scheduler)
2836 {
2837     GString *xpath = NULL;
2838     xmlNode *xml = NULL;
2839 
2840     CRM_CHECK((resource != NULL) && (op != NULL) && (node != NULL),
2841               return NULL);
2842 
2843     xpath = g_string_sized_new(256);
2844     pcmk__g_strcat(xpath,
2845                    XPATH_NODE_STATE "[@" XML_ATTR_UNAME "='", node, "']"
2846                    SUB_XPATH_LRM_RESOURCE "[@" XML_ATTR_ID "='", resource, "']"
2847                    SUB_XPATH_LRM_RSC_OP "[@" XML_LRM_ATTR_TASK "='", op, "'",
2848                    NULL);
2849 
2850     /* Need to check against transition_magic too? */
2851     if ((source != NULL) && (strcmp(op, PCMK_ACTION_MIGRATE_TO) == 0)) {
2852         pcmk__g_strcat(xpath,
2853                        " and @" XML_LRM_ATTR_MIGRATE_TARGET "='", source, "']",
2854                        NULL);
2855 
2856     } else if ((source != NULL)
2857                && (strcmp(op, PCMK_ACTION_MIGRATE_FROM) == 0)) {
2858         pcmk__g_strcat(xpath,
2859                        " and @" XML_LRM_ATTR_MIGRATE_SOURCE "='", source, "']",
2860                        NULL);
2861     } else {
2862         g_string_append_c(xpath, ']');
2863     }
2864 
2865     xml = get_xpath_object((const char *) xpath->str, scheduler->input,
2866                            LOG_DEBUG);
2867     g_string_free(xpath, TRUE);
2868 
2869     if (xml && target_rc >= 0) {
2870         int rc = PCMK_OCF_UNKNOWN_ERROR;
2871         int status = PCMK_EXEC_ERROR;
2872 
2873         crm_element_value_int(xml, XML_LRM_ATTR_RC, &rc);
2874         crm_element_value_int(xml, XML_LRM_ATTR_OPSTATUS, &status);
2875         if ((rc != target_rc) || (status != PCMK_EXEC_DONE)) {
2876             return NULL;
2877         }
2878     }
2879     return xml;
2880 }
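
/* Illustrative sketch, not part of the actual scheduler code: for a
 * hypothetical resource "db", a monitor operation, and node "node1", the
 * XPath built by find_lrm_op() would look something like (a single string,
 * wrapped here for readability):
 *   /cib/status/node_state[@uname='node1']/lrm/lrm_resources
 *   /lrm_resource[@id='db']/lrm_rsc_op[@operation='monitor']
 * The PE_UNPACK_EXAMPLES guard is hypothetical, so this sketch is compiled
 * out by default.
 */
#ifdef PE_UNPACK_EXAMPLES
static GString *
example_lrm_op_xpath(void)
{
    GString *xpath = g_string_sized_new(256);

    pcmk__g_strcat(xpath,
                   XPATH_NODE_STATE "[@" XML_ATTR_UNAME "='node1']"
                   SUB_XPATH_LRM_RESOURCE "[@" XML_ATTR_ID "='db']"
                   SUB_XPATH_LRM_RSC_OP "[@" XML_LRM_ATTR_TASK "='monitor']",
                   NULL);
    return xpath;  // caller frees with g_string_free(xpath, TRUE)
}
#endif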
2881 
2882 static xmlNode *
2883 find_lrm_resource(const char *rsc_id, const char *node_name,
2884                   pcmk_scheduler_t *scheduler)
2885 {
2886     GString *xpath = NULL;
2887     xmlNode *xml = NULL;
2888 
2889     CRM_CHECK((rsc_id != NULL) && (node_name != NULL), return NULL);
2890 
2891     xpath = g_string_sized_new(256);
2892     pcmk__g_strcat(xpath,
2893                    XPATH_NODE_STATE "[@" XML_ATTR_UNAME "='", node_name, "']"
2894                    SUB_XPATH_LRM_RESOURCE "[@" XML_ATTR_ID "='", rsc_id, "']",
2895                    NULL);
2896 
2897     xml = get_xpath_object((const char *) xpath->str, scheduler->input,
2898                            LOG_DEBUG);
2899 
2900     g_string_free(xpath, TRUE);
2901     return xml;
2902 }
2903 
2904 /*!
2905  * \internal
2906  * \brief Check whether a resource has no completed action history on a node
2907  *
2908  * \param[in,out] rsc        Resource to check
2909  * \param[in]     node_name  Node to check
2910  *
2911  * \return true if \p rsc is unknown on \p node_name, otherwise false
2912  */
2913 static bool
2914 unknown_on_node(pcmk_resource_t *rsc, const char *node_name)
2915 {
2916     bool result = false;
2917     xmlXPathObjectPtr search;
2918     GString *xpath = g_string_sized_new(256);
2919 
2920     pcmk__g_strcat(xpath,
2921                    XPATH_NODE_STATE "[@" XML_ATTR_UNAME "='", node_name, "']"
2922                    SUB_XPATH_LRM_RESOURCE "[@" XML_ATTR_ID "='", rsc->id, "']"
2923                    SUB_XPATH_LRM_RSC_OP "[@" XML_LRM_ATTR_RC "!='193']",
2924                    NULL);
2925     search = xpath_search(rsc->cluster->input, (const char *) xpath->str);
2926     result = (numXpathResults(search) == 0);
2927     freeXpathObject(search);
2928     g_string_free(xpath, TRUE);
2929     return result;
2930 }
2931 
2932 /*!
2933  * \brief Check whether a probe/monitor indicating the resource was not running
2934  * on a node happened after some event
2935  *
2936  * \param[in]     rsc_id     Resource being checked
2937  * \param[in]     node_name  Node being checked
2938  * \param[in]     xml_op     Event that monitor is being compared to
2939  * \param[in]     same_node  Whether the operations are on the same node
2940  * \param[in,out] scheduler  Scheduler data
2941  *
2942  * \return true if such a monitor happened after the event, false otherwise
2943  */
2944 static bool
2945 monitor_not_running_after(const char *rsc_id, const char *node_name,
2946                           const xmlNode *xml_op, bool same_node,
2947                           pcmk_scheduler_t *scheduler)
2948 {
2949     /* Any probe/monitor operation on the node indicating it was not running
2950      * there
2951      */
2952     xmlNode *monitor = find_lrm_op(rsc_id, PCMK_ACTION_MONITOR, node_name,
2953                                    NULL, PCMK_OCF_NOT_RUNNING, scheduler);
2954 
2955     return (monitor && pe__is_newer_op(monitor, xml_op, same_node) > 0);
2956 }
2957 
2958 /*!
2959  * \brief Check whether any non-monitor operation on a node happened after some
2960  * event
2961  *
2962  * \param[in]     rsc_id     Resource being checked
2963  * \param[in]     node_name  Node being checked
2964  * \param[in]     xml_op     Event that non-monitor is being compared to
2965  * \param[in]     same_node  Whether the operations are on the same node
2966  * \param[in,out] scheduler  Scheduler data
2967  *
2968  * \return true if such an operation happened after the event, false otherwise
2969  */
2970 static bool
2971 non_monitor_after(const char *rsc_id, const char *node_name,
2972                   const xmlNode *xml_op, bool same_node,
2973                   pcmk_scheduler_t *scheduler)
2974 {
2975     xmlNode *lrm_resource = NULL;
2976 
2977     lrm_resource = find_lrm_resource(rsc_id, node_name, scheduler);
2978     if (lrm_resource == NULL) {
2979         return false;
2980     }
2981 
2982     for (xmlNode *op = first_named_child(lrm_resource, XML_LRM_TAG_RSC_OP);
2983          op != NULL; op = crm_next_same_xml(op)) {
2984         const char *task = NULL;
2985 
2986         if (op == xml_op) {
2987             continue;
2988         }
2989 
2990         task = crm_element_value(op, XML_LRM_ATTR_TASK);
2991 
2992         if (pcmk__str_any_of(task, PCMK_ACTION_START, PCMK_ACTION_STOP,
2993                              PCMK_ACTION_MIGRATE_TO, PCMK_ACTION_MIGRATE_FROM,
2994                              NULL)
2995             && pe__is_newer_op(op, xml_op, same_node) > 0) {
2996             return true;
2997         }
2998     }
2999 
3000     return false;
3001 }
3002 
3003 /*!
3004  * \brief Check whether the resource has newer state on a node after a migration
3005  * attempt
3006  *
3007  * \param[in]     rsc_id        Resource being checked
3008  * \param[in]     node_name     Node being checked
3009  * \param[in]     migrate_to    Any migrate_to event that is being compared to
3010  * \param[in]     migrate_from  Any migrate_from event that is being compared to
3011  * \param[in,out] scheduler     Scheduler data
3012  *
3013  * \return true if the resource has newer state on the node, false otherwise
3014  */
3015 static bool
3016 newer_state_after_migrate(const char *rsc_id, const char *node_name,
3017                           const xmlNode *migrate_to,
3018                           const xmlNode *migrate_from,
3019                           pcmk_scheduler_t *scheduler)
3020 {
3021     const xmlNode *xml_op = migrate_to;
3022     const char *source = NULL;
3023     const char *target = NULL;
3024     bool same_node = false;
3025 
3026     if (migrate_from) {
3027         xml_op = migrate_from;
3028     }
3029 
3030     source = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_SOURCE);
3031     target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET);
3032 
3033     /* It's preferable to compare to the migrate event on the same node, if
3034      * one exists, since call IDs are more reliable.
3035      */
3036     if (pcmk__str_eq(node_name, target, pcmk__str_casei)) {
3037         if (migrate_from) {
3038            xml_op = migrate_from;
3039            same_node = true;
3040 
3041         } else {
3042            xml_op = migrate_to;
3043         }
3044 
3045     } else if (pcmk__str_eq(node_name, source, pcmk__str_casei)) {
3046         if (migrate_to) {
3047            xml_op = migrate_to;
3048            same_node = true;
3049 
3050         } else {
3051            xml_op = migrate_from;
3052         }
3053     }
3054 
3055     /* If there's any newer non-monitor operation on the node, or any newer
3056      * probe/monitor operation on the node indicating it was not running there,
3057      * the migration events potentially no longer matter for the node.
3058      */
3059     return non_monitor_after(rsc_id, node_name, xml_op, same_node, scheduler)
3060            || monitor_not_running_after(rsc_id, node_name, xml_op, same_node,
3061                                         scheduler);
3062 }
3063 
3064 /*!
3065  * \internal
3066  * \brief Parse migration source and target node names from history entry
3067  *
3068  * \param[in]  entry        Resource history entry for a migration action
3069  * \param[in]  source_node  If not NULL, source must match this node
3070  * \param[in]  target_node  If not NULL, target must match this node
3071  * \param[out] source_name  Where to store migration source node name
3072  * \param[out] target_name  Where to store migration target node name
3073  *
3074  * \return Standard Pacemaker return code
3075  */
3076 static int
3077 get_migration_node_names(const xmlNode *entry, const pcmk_node_t *source_node,
3078                          const pcmk_node_t *target_node,
3079                          const char **source_name, const char **target_name)
3080 {
3081     *source_name = crm_element_value(entry, XML_LRM_ATTR_MIGRATE_SOURCE);
3082     *target_name = crm_element_value(entry, XML_LRM_ATTR_MIGRATE_TARGET);
3083     if ((*source_name == NULL) || (*target_name == NULL)) {
3084         crm_err("Ignoring resource history entry %s without "
3085                 XML_LRM_ATTR_MIGRATE_SOURCE " and " XML_LRM_ATTR_MIGRATE_TARGET,
3086                 ID(entry));
3087         return pcmk_rc_unpack_error;
3088     }
3089 
3090     if ((source_node != NULL)
3091         && !pcmk__str_eq(*source_name, source_node->details->uname,
3092                          pcmk__str_casei|pcmk__str_null_matches)) {
3093         crm_err("Ignoring resource history entry %s because "
3094                 XML_LRM_ATTR_MIGRATE_SOURCE "='%s' does not match %s",
3095                 ID(entry), *source_name, pe__node_name(source_node));
3096         return pcmk_rc_unpack_error;
3097     }
3098 
3099     if ((target_node != NULL)
3100         && !pcmk__str_eq(*target_name, target_node->details->uname,
3101                          pcmk__str_casei|pcmk__str_null_matches)) {
3102         crm_err("Ignoring resource history entry %s because "
3103                 XML_LRM_ATTR_MIGRATE_TARGET "='%s' does not match %s",
3104                 ID(entry), *target_name, pe__node_name(target_node));
3105         return pcmk_rc_unpack_error;
3106     }
3107 
3108     return pcmk_rc_ok;
3109 }
3110 
3111 /*!
3112  * \internal
3113  * \brief Add a migration source to a resource's list of dangling migrations
3114  *
3115  * If the migrate_to and migrate_from actions in a live migration both
3116  * succeeded, but there is no stop on the source, the migration is considered
3117  * "dangling." Add the source to the resource's dangling migration list, which
3118  * will be used to schedule a stop on the source without affecting the target.
3119  *
3120  * \param[in,out] rsc   Resource involved in migration
3121  * \param[in]     node  Migration source
3122  */
3123 static void
3124 add_dangling_migration(pcmk_resource_t *rsc, const pcmk_node_t *node)
3125 {
3126     pe_rsc_trace(rsc, "Dangling migration of %s requires stop on %s",
3127                  rsc->id, pe__node_name(node));
3128     rsc->role = pcmk_role_stopped;
3129     rsc->dangling_migrations = g_list_prepend(rsc->dangling_migrations,
3130                                               (gpointer) node);
3131 }
3132 
3133 /*!
3134  * \internal
3135  * \brief Update resource role etc. after a successful migrate_to action
3136  *
3137  * \param[in,out] history  Parsed action result history
3138  */
3139 static void
3140 unpack_migrate_to_success(struct action_history *history)
3141 {
3142     /* A complete migration sequence is:
3143      * 1. migrate_to on source node (which succeeded if we get to this function)
3144      * 2. migrate_from on target node
3145      * 3. stop on source node
3146      *
3147      * If no migrate_from has happened, the migration is considered to be
3148      * "partial". If the migrate_from succeeded but no stop has happened, the
3149      * migration is considered to be "dangling".
3150      *
3151      * If a successful migrate_to and stop have happened on the source node, we
3152      * still need to check for a partial migration, due to scenarios (easier to
3153      * produce with batch-limit=1) like:
3154      *
3155      * - A resource is migrating from node1 to node2, and a migrate_to is
3156      *   initiated for it on node1.
3157      *
3158      * - node2 goes into standby mode while the migrate_to is pending, which
3159      *   aborts the transition.
3160      *
3161      * - Upon completion of the migrate_to, a new transition schedules a stop
3162      *   on both nodes and a start on node1.
3163      *
3164      * - If the new transition is aborted for any reason while the resource is
3165      *   stopping on node1, the transition after that stop completes will see
3166      *   the migrate_to and stop on the source, but it's still a partial
3167      *   migration, and the resource must be stopped on node2 because it is
3168      *   potentially active there due to the migrate_to.
3169      *
3170      *   We also need to take into account that either node's history may be
3171      *   cleared at any point in the migration process.
3172      */
3173     int from_rc = PCMK_OCF_OK;
3174     int from_status = PCMK_EXEC_PENDING;
3175     pcmk_node_t *target_node = NULL;
3176     xmlNode *migrate_from = NULL;
3177     const char *source = NULL;
3178     const char *target = NULL;
3179     bool source_newer_op = false;
3180     bool target_newer_state = false;
3181     bool active_on_target = false;
3182 
3183     // Get source and target node names from XML
3184     if (get_migration_node_names(history->xml, history->node, NULL, &source,
3185                                  &target) != pcmk_rc_ok) {
3186         return;
3187     }
3188 
3189     // Check for newer state on the source
3190     source_newer_op = non_monitor_after(history->rsc->id, source, history->xml,
3191                                         true, history->rsc->cluster);
3192 
3193     // Check for a migrate_from action from this source on the target
3194     migrate_from = find_lrm_op(history->rsc->id, PCMK_ACTION_MIGRATE_FROM,
3195                                target, source, -1, history->rsc->cluster);
3196     if (migrate_from != NULL) {
3197         if (source_newer_op) {
3198             /* There's a newer non-monitor operation on the source and a
3199              * migrate_from on the target, so this migrate_to is irrelevant to
3200              * the resource's state.
3201              */
3202             return;
3203         }
3204         crm_element_value_int(migrate_from, XML_LRM_ATTR_RC, &from_rc);
3205         crm_element_value_int(migrate_from, XML_LRM_ATTR_OPSTATUS,
3206                               &from_status);
3207     }
3208 
3209     /* If the resource has newer state on both the source and target after the
3210      * migration events, this migrate_to is irrelevant to the resource's state.
3211      */
3212     target_newer_state = newer_state_after_migrate(history->rsc->id, target,
3213                                                    history->xml, migrate_from,
3214                                                    history->rsc->cluster);
3215     if (source_newer_op && target_newer_state) {
3216         return;
3217     }
3218 
3219     /* Check for dangling migration (migrate_from succeeded but stop not done).
3220      * We know there's no stop because we already returned if the target has a
3221      * migrate_from and the source has any newer non-monitor operation.
3222      */
3223     if ((from_rc == PCMK_OCF_OK) && (from_status == PCMK_EXEC_DONE)) {
3224         add_dangling_migration(history->rsc, history->node);
3225         return;
3226     }
3227 
3228     /* Without newer state, this migrate_to implies the resource is active.
3229      * (Clones are not allowed to migrate, so role can't be promoted.)
3230      */
3231     history->rsc->role = pcmk_role_started;
3232 
3233     target_node = pe_find_node(history->rsc->cluster->nodes, target);
3234     active_on_target = !target_newer_state && (target_node != NULL)
3235                        && target_node->details->online;
3236 
3237     if (from_status != PCMK_EXEC_PENDING) { // migrate_from failed on target
3238         if (active_on_target) {
3239             native_add_running(history->rsc, target_node, history->rsc->cluster,
3240                                TRUE);
3241         } else {
3242             // Mark resource as failed, require recovery, and prevent migration
3243             pe__set_resource_flags(history->rsc,
3244                                    pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
3245             pe__clear_resource_flags(history->rsc, pcmk_rsc_migratable);
3246         }
3247         return;
3248     }
3249 
3250     // The migrate_from is pending, complete but erased, or to be scheduled
3251 
3252     /* If there is no history at all for the resource on an online target, then
3253      * it was likely cleaned. Just return, and we'll schedule a probe. Once we
3254      * have the probe result, it will be reflected in target_newer_state.
3255      */
3256     if ((target_node != NULL) && target_node->details->online
3257         && unknown_on_node(history->rsc, target)) {
3258         return;
3259     }
3260 
3261     if (active_on_target) {
3262         pcmk_node_t *source_node = pe_find_node(history->rsc->cluster->nodes,
3263                                                 source);
3264 
3265         native_add_running(history->rsc, target_node, history->rsc->cluster,
3266                            FALSE);
3267         if ((source_node != NULL) && source_node->details->online) {
3268             /* This is a partial migration: the migrate_to completed
3269              * successfully on the source, but the migrate_from has not
3270              * completed. Remember the source and target; if the newly
3271              * chosen target remains the same when we schedule actions
3272              * later, we may continue with the migration.
3273              */
3274             history->rsc->partial_migration_target = target_node;
3275             history->rsc->partial_migration_source = source_node;
3276         }
3277 
3278     } else if (!source_newer_op) {
3279         // Mark resource as failed, require recovery, and prevent migration
3280         pe__set_resource_flags(history->rsc,
3281                                pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
3282         pe__clear_resource_flags(history->rsc, pcmk_rsc_migratable);
3283     }
3284 }
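
/* Illustrative sketch, not part of the actual scheduler code: the "dangling"
 * branch in unpack_migrate_to_success() reduces to "the migrate_from
 * succeeded and nothing newer has happened on the source", meaning a stop is
 * still owed on the source node. The PE_UNPACK_EXAMPLES guard is
 * hypothetical, so this sketch is compiled out by default.
 */
#ifdef PE_UNPACK_EXAMPLES
static bool
example_migration_is_dangling(int from_rc, int from_status,
                              bool source_newer_op)
{
    return !source_newer_op
           && (from_rc == PCMK_OCF_OK) && (from_status == PCMK_EXEC_DONE);
}
#endif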
3285 
3286 /*!
3287  * \internal
3288  * \brief Update resource role etc. after a failed migrate_to action
3289  *
3290  * \param[in,out] history  Parsed action result history
3291  */
3292 static void
3293 unpack_migrate_to_failure(struct action_history *history)
3294 {
3295     xmlNode *target_migrate_from = NULL;
3296     const char *source = NULL;
3297     const char *target = NULL;
3298 
3299     // Get source and target node names from XML
3300     if (get_migration_node_names(history->xml, history->node, NULL, &source,
3301                                  &target) != pcmk_rc_ok) {
3302         return;
3303     }
3304 
3305     /* If a migration failed, we have to assume the resource is active. Clones
3306      * are not allowed to migrate, so role can't be promoted.
3307      */
3308     history->rsc->role = pcmk_role_started;
3309 
3310     // Check for migrate_from on the target
3311     target_migrate_from = find_lrm_op(history->rsc->id,
3312                                       PCMK_ACTION_MIGRATE_FROM, target, source,
3313                                       PCMK_OCF_OK, history->rsc->cluster);
3314 
3315     if (/* If the resource state is unknown on the target, it will likely be
3316          * probed there, so don't assume it is running there yet. If the
3317          * probe does detect it running there, we will end up back here
3318          * anyway.
3319          */
3320         !unknown_on_node(history->rsc, target)
3321         /* If the resource has newer state on the target after the migration
3322          * events, this migrate_to no longer matters for the target.
3323          */
3324         && !newer_state_after_migrate(history->rsc->id, target, history->xml,
3325                                       target_migrate_from,
3326                                       history->rsc->cluster)) {
3327         /* The resource has no newer state on the target, so assume that it
3328          * is still active there, as long as the target node is actually up
3329          * (which is checked just below).
3330          */
3331         pcmk_node_t *target_node = pe_find_node(history->rsc->cluster->nodes,
3332                                                 target);
3333 
3334         if (target_node && target_node->details->online) {
3335             native_add_running(history->rsc, target_node, history->rsc->cluster,
3336                                FALSE);
3337         }
3338 
3339     } else if (!non_monitor_after(history->rsc->id, source, history->xml, true,
3340                                   history->rsc->cluster)) {
3341         /* We know the resource has newer state on the target, but this
3342          * migrate_to still matters for the source as long as there's no newer
3343          * non-monitor operation there.
3344          */
3345 
3346         // Mark node as having dangling migration so we can force a stop later
3347         history->rsc->dangling_migrations =
3348             g_list_prepend(history->rsc->dangling_migrations,
3349                            (gpointer) history->node);
3350     }
3351 }
3352 
3353 /*!
3354  * \internal
3355  * \brief Update resource role etc. after a failed migrate_from action
3356  *
3357  * \param[in,out] history  Parsed action result history
3358  */
3359 static void
3360 unpack_migrate_from_failure(struct action_history *history)
3361 {
3362     xmlNode *source_migrate_to = NULL;
3363     const char *source = NULL;
3364     const char *target = NULL;
3365 
3366     // Get source and target node names from XML
3367     if (get_migration_node_names(history->xml, NULL, history->node, &source,
3368                                  &target) != pcmk_rc_ok) {
3369         return;
3370     }
3371 
3372     /* If a migration failed, we have to assume the resource is active. Clones
3373      * are not allowed to migrate, so role can't be promoted.
3374      */
3375     history->rsc->role = pcmk_role_started;
3376 
3377     // Check for a migrate_to on the source
3378     source_migrate_to = find_lrm_op(history->rsc->id, PCMK_ACTION_MIGRATE_TO,
3379                                     source, target, PCMK_OCF_OK,
3380                                     history->rsc->cluster);
3381 
3382     if (/* If the resource state is unknown on the source, it will likely be
3383          * probed there, so don't assume it is running there yet. If the
3384          * probe does detect it running there, we will end up back here
3385          * anyway.
3386          */
3387         !unknown_on_node(history->rsc, source)
3388         /* If the resource has newer state on the source after the migration
3389          * events, this migrate_from no longer matters for the source.
3390          */
3391         && !newer_state_after_migrate(history->rsc->id, source,
3392                                       source_migrate_to, history->xml,
3393                                       history->rsc->cluster)) {
3394         /* The resource has no newer state on the source, so assume it's still
3395          * active there (if it is up).
3396          */
3397         pcmk_node_t *source_node = pe_find_node(history->rsc->cluster->nodes,
3398                                                 source);
3399 
3400         if (source_node && source_node->details->online) {
3401             native_add_running(history->rsc, source_node, history->rsc->cluster,
3402                                TRUE);
3403         }
3404     }
3405 }
3406 
3407 /*!
3408  * \internal
3409  * \brief Add an action to cluster's list of failed actions
3410  *
3411  * \param[in,out] history  Parsed action result history
3412  */
3413 static void
3414 record_failed_op(struct action_history *history)
3415 {
3416     if (!(history->node->details->online)) {
3417         return;
3418     }
3419 
3420     for (const xmlNode *xIter = history->rsc->cluster->failed->children;
3421          xIter != NULL; xIter = xIter->next) {
3422 
3423         const char *key = pe__xe_history_key(xIter);
3424         const char *uname = crm_element_value(xIter, XML_ATTR_UNAME);
3425 
3426         if (pcmk__str_eq(history->key, key, pcmk__str_none)
3427             && pcmk__str_eq(uname, history->node->details->uname,
3428                             pcmk__str_casei)) {
3429             crm_trace("Skipping duplicate entry %s on %s",
3430                       history->key, pe__node_name(history->node));
3431             return;
3432         }
3433     }
3434 
3435     crm_trace("Adding entry for %s on %s to failed action list",
3436               history->key, pe__node_name(history->node));
3437     crm_xml_add(history->xml, XML_ATTR_UNAME, history->node->details->uname);
3438     crm_xml_add(history->xml, XML_LRM_ATTR_RSCID, history->rsc->id);
3439     add_node_copy(history->rsc->cluster->failed, history->xml);
3440 }
3441 
3442 static char *
3443 last_change_str(const xmlNode *xml_op)
3444 {
3445     time_t when;
3446     char *result = NULL;
3447 
3448     if (crm_element_value_epoch(xml_op, XML_RSC_OP_LAST_CHANGE,
3449                                 &when) == pcmk_ok) {
3450         char *when_s = pcmk__epoch2str(&when, 0);
3451         const char *p = strchr(when_s, ' ');
3452 
3453         // Skip day of week to make message shorter
3454         if ((p != NULL) && (*(++p) != '\0')) {
3455             result = strdup(p);
3456             CRM_ASSERT(result != NULL);
3457         }
3458         free(when_s);
3459     }
3460 
3461     if (result == NULL) {
3462         result = strdup("unknown time");
3463         CRM_ASSERT(result != NULL);
3464     }
3465 
3466     return result;
3467 }
3468 
3469 /*!
3470  * \internal
3471  * \brief Compare two on-fail values
3472  *
3473  * \param[in] first   One on-fail value to compare
3474  * \param[in] second  The other on-fail value to compare
3475  *
3476  * \return A negative number if second is more severe than first, zero if they
3477  *         are equal, or a positive number if first is more severe than second.
3478  * \note This is only needed until the action_fail_response values can be
3479  *       renumbered at the next API compatibility break.
3480  */
3481 static int
3482 cmp_on_fail(enum action_fail_response first, enum action_fail_response second)
3483 {
3484     switch (first) {
3485         case pcmk_on_fail_demote:
3486             switch (second) {
3487                 case pcmk_on_fail_ignore:
3488                     return 1;
3489                 case pcmk_on_fail_demote:
3490                     return 0;
3491                 default:
3492                     return -1;
3493             }
3494             break;
3495 
3496         case pcmk_on_fail_reset_remote:
3497             switch (second) {
3498                 case pcmk_on_fail_ignore:
3499                 case pcmk_on_fail_demote:
3500                 case pcmk_on_fail_restart:
3501                     return 1;
3502                 case pcmk_on_fail_reset_remote:
3503                     return 0;
3504                 default:
3505                     return -1;
3506             }
3507             break;
3508 
3509         case pcmk_on_fail_restart_container:
3510             switch (second) {
3511                 case pcmk_on_fail_ignore:
3512                 case pcmk_on_fail_demote:
3513                 case pcmk_on_fail_restart:
3514                 case pcmk_on_fail_reset_remote:
3515                     return 1;
3516                 case pcmk_on_fail_restart_container:
3517                     return 0;
3518                 default:
3519                     return -1;
3520             }
3521             break;
3522 
3523         default:
3524             break;
3525     }
3526     switch (second) {
3527         case pcmk_on_fail_demote:
3528             return (first == pcmk_on_fail_ignore)? -1 : 1;
3529 
3530         case pcmk_on_fail_reset_remote:
3531             switch (first) {
3532                 case pcmk_on_fail_ignore:
3533                 case pcmk_on_fail_demote:
3534                 case pcmk_on_fail_restart:
3535                     return -1;
3536                 default:
3537                     return 1;
3538             }
3539             break;
3540 
3541         case pcmk_on_fail_restart_container:
3542             switch (first) {
3543                 case pcmk_on_fail_ignore:
3544                 case pcmk_on_fail_demote:
3545                 case pcmk_on_fail_restart:
3546                 case pcmk_on_fail_reset_remote:
3547                     return -1;
3548                 default:
3549                     return 1;
3550             }
3551             break;
3552 
3553         default:
3554             break;
3555     }
3556     return first - second;
3557 }
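
/* Illustrative sketch, not part of the actual scheduler code: callers keep
 * whichever on-fail handling is more severe, which is how
 * unpack_rsc_op_failure() below escalates *on_fail against the configured
 * value. The PE_UNPACK_EXAMPLES guard is hypothetical, so this sketch is
 * compiled out by default.
 */
#ifdef PE_UNPACK_EXAMPLES
static enum action_fail_response
example_more_severe_on_fail(enum action_fail_response current,
                            enum action_fail_response configured)
{
    // A negative cmp_on_fail() result means the second argument is more severe
    return (cmp_on_fail(current, configured) < 0)? configured : current;
}
#endif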
3558 
3559 /*!
3560  * \internal
3561  * \brief Ban a resource (or its clone if an anonymous instance) from all nodes
3562  *
3563  * \param[in,out] rsc  Resource to ban
3564  */
3565 static void
3566 ban_from_all_nodes(pcmk_resource_t *rsc)
3567 {
3568     int score = -INFINITY;
3569     pcmk_resource_t *fail_rsc = rsc;
3570 
3571     if (fail_rsc->parent != NULL) {
3572         pcmk_resource_t *parent = uber_parent(fail_rsc);
3573 
3574         if (pe_rsc_is_anon_clone(parent)) {
3575             /* For anonymous clones, if an operation with on-fail=stop fails for
3576              * any instance, the entire clone must stop.
3577              */
3578             fail_rsc = parent;
3579         }
3580     }
3581 
3582     // Ban the resource from all nodes
3583     crm_notice("%s will not be started under current conditions", fail_rsc->id);
3584     if (fail_rsc->allowed_nodes != NULL) {
3585         g_hash_table_destroy(fail_rsc->allowed_nodes);
3586     }
3587     fail_rsc->allowed_nodes = pe__node_list2table(rsc->cluster->nodes);
3588     g_hash_table_foreach(fail_rsc->allowed_nodes, set_node_score, &score);
3589 }
3590 
3591 /*!
3592  * \internal
3593  * \brief Get configured failure handling and role after failure for an action
3594  *
3595  * \param[in,out] history    Unpacked action history entry
3596  * \param[out]    on_fail    Where to set configured failure handling
3598  * \param[out]    fail_role  Where to set role after failure
3598  */
3599 static void
3600 unpack_failure_handling(struct action_history *history,
3601                         enum action_fail_response *on_fail,
3602                         enum rsc_role_e *fail_role)
3603 {
3604     xmlNode *config = pcmk__find_action_config(history->rsc, history->task,
3605                                                history->interval_ms, true);
3606 
3607     GHashTable *meta = pcmk__unpack_action_meta(history->rsc, history->node,
3608                                                 history->task,
3609                                                 history->interval_ms, config);
3610 
3611     const char *on_fail_str = g_hash_table_lookup(meta, XML_OP_ATTR_ON_FAIL);
3612 
3613     *on_fail = pcmk__parse_on_fail(history->rsc, history->task,
3614                                    history->interval_ms, on_fail_str);
3615     *fail_role = pcmk__role_after_failure(history->rsc, history->task, *on_fail,
3616                                           meta);
3617     g_hash_table_destroy(meta);
3618 }
3619 
3620 /*!
3621  * \internal
3622  * \brief Update resource role, failure handling, etc., after a failed action
3623  *
3624  * \param[in,out] history         Parsed action result history
3625  * \param[in]     config_on_fail  Action failure handling from configuration
3626  * \param[in]     fail_role       Resource's role after failure of this action
3627  * \param[out]    last_failure    This will be set to the history XML
3628  * \param[in,out] on_fail         Actual handling of action result
3629  */
3630 static void
3631 unpack_rsc_op_failure(struct action_history *history,
3632                       enum action_fail_response config_on_fail,
3633                       enum rsc_role_e fail_role, xmlNode **last_failure,
3634                       enum action_fail_response *on_fail)
3635 {
3636     bool is_probe = false;
3637     char *last_change_s = NULL;
3638 
3639     *last_failure = history->xml;
3640 
3641     is_probe = pcmk_xe_is_probe(history->xml);
3642     last_change_s = last_change_str(history->xml);
3643 
3644     if (!pcmk_is_set(history->rsc->cluster->flags, pcmk_sched_symmetric_cluster)
3645         && (history->exit_status == PCMK_OCF_NOT_INSTALLED)) {
3646         crm_trace("Unexpected result (%s%s%s) was recorded for "
3647                   "%s of %s on %s at %s " CRM_XS " exit-status=%d id=%s",
3648                   services_ocf_exitcode_str(history->exit_status),
3649                   (pcmk__str_empty(history->exit_reason)? "" : ": "),
3650                   pcmk__s(history->exit_reason, ""),
3651                   (is_probe? "probe" : history->task), history->rsc->id,
3652                   pe__node_name(history->node), last_change_s,
3653                   history->exit_status, history->id);
3654     } else {
3655         crm_warn("Unexpected result (%s%s%s) was recorded for "
3656                   "%s of %s on %s at %s " CRM_XS " exit-status=%d id=%s",
3657                  services_ocf_exitcode_str(history->exit_status),
3658                  (pcmk__str_empty(history->exit_reason)? "" : ": "),
3659                  pcmk__s(history->exit_reason, ""),
3660                  (is_probe? "probe" : history->task), history->rsc->id,
3661                  pe__node_name(history->node), last_change_s,
3662                  history->exit_status, history->id);
3663 
3664         if (is_probe && (history->exit_status != PCMK_OCF_OK)
3665             && (history->exit_status != PCMK_OCF_NOT_RUNNING)
3666             && (history->exit_status != PCMK_OCF_RUNNING_PROMOTED)) {
3667 
3668             /* A failed (not just unexpected) probe result could mean the user
3669              * didn't know that resources are probed even where they can't run.
3670              */
3671             crm_notice("If it is not possible for %s to run on %s, see "
3672                        "the resource-discovery option for location constraints",
3673                        history->rsc->id, pe__node_name(history->node));
3674         }
3675 
3676         record_failed_op(history);
3677     }
3678 
3679     free(last_change_s);
3680 
3681     if (cmp_on_fail(*on_fail, config_on_fail) < 0) {
3682         pe_rsc_trace(history->rsc, "on-fail %s -> %s for %s",
3683                      fail2text(*on_fail), fail2text(config_on_fail),
3684                      history->key);
3685         *on_fail = config_on_fail;
3686     }
3687 
3688     if (strcmp(history->task, PCMK_ACTION_STOP) == 0) {
3689         resource_location(history->rsc, history->node, -INFINITY,
3690                           "__stop_fail__", history->rsc->cluster);
3691 
3692     } else if (strcmp(history->task, PCMK_ACTION_MIGRATE_TO) == 0) {
3693         unpack_migrate_to_failure(history);
3694 
3695     } else if (strcmp(history->task, PCMK_ACTION_MIGRATE_FROM) == 0) {
3696         unpack_migrate_from_failure(history);
3697 
3698     } else if (strcmp(history->task, PCMK_ACTION_PROMOTE) == 0) {
3699         history->rsc->role = pcmk_role_promoted;
3700 
3701     } else if (strcmp(history->task, PCMK_ACTION_DEMOTE) == 0) {
3702         if (config_on_fail == pcmk_on_fail_block) {
3703             history->rsc->role = pcmk_role_promoted;
3704             pe__set_next_role(history->rsc, pcmk_role_stopped,
3705                               "demote with on-fail=block");
3706 
3707         } else if (history->exit_status == PCMK_OCF_NOT_RUNNING) {
3708             history->rsc->role = pcmk_role_stopped;
3709 
3710         } else {
3711             /* Staying in the promoted role would put the scheduler and
3712              * controller into a loop. Setting the role to unpromoted is not
3713              * dangerous because the resource will be stopped as part of
3714              * recovery, and any promotion will be ordered after that stop.
3715              */
3716             history->rsc->role = pcmk_role_unpromoted;
3717         }
3718     }
3719 
3720     if (is_probe && (history->exit_status == PCMK_OCF_NOT_INSTALLED)) {
3721         /* leave stopped */
3722         pe_rsc_trace(history->rsc, "Leaving %s stopped", history->rsc->id);
3723         history->rsc->role = pcmk_role_stopped;
3724 
3725     } else if (history->rsc->role < pcmk_role_started) {
3726         pe_rsc_trace(history->rsc, "Setting %s active", history->rsc->id);
3727         set_active(history->rsc);
3728     }
3729 
3730     pe_rsc_trace(history->rsc,
3731                  "Resource %s: role=%s, unclean=%s, on_fail=%s, fail_role=%s",
3732                  history->rsc->id, role2text(history->rsc->role),
3733                  pcmk__btoa(history->node->details->unclean),
3734                  fail2text(config_on_fail), role2text(fail_role));
3735 
3736     if ((fail_role != pcmk_role_started)
3737         && (history->rsc->next_role < fail_role)) {
3738         pe__set_next_role(history->rsc, fail_role, "failure");
3739     }
3740 
3741     if (fail_role == pcmk_role_stopped) {
3742         ban_from_all_nodes(history->rsc);
3743     }
3744 }
3745 
3746 /*!
3747  * \internal
3748  * \brief Block a resource with a failed action if it cannot be recovered
3749  *
3750  * If resource action is a failed stop and fencing is not possible, mark the
3751  * resource as unmanaged and blocked, since recovery cannot be done.
3752  *
3753  * \param[in,out] history  Parsed action history entry
3754  */
3755 static void
3756 block_if_unrecoverable(struct action_history *history)
3757 {
3758     char *last_change_s = NULL;
3759 
3760     if (strcmp(history->task, PCMK_ACTION_STOP) != 0) {
3761         return; // All actions besides stop are always recoverable
3762     }
3763     if (pe_can_fence(history->node->details->data_set, history->node)) {
3764         return; // Failed stops are recoverable via fencing
3765     }
3766 
3767     last_change_s = last_change_str(history->xml);
3768     pe_proc_err("No further recovery can be attempted for %s "
3769                 "because %s on %s failed (%s%s%s) at %s "
3770                 CRM_XS " rc=%d id=%s",
3771                 history->rsc->id, history->task, pe__node_name(history->node),
3772                 services_ocf_exitcode_str(history->exit_status),
3773                 (pcmk__str_empty(history->exit_reason)? "" : ": "),
3774                 pcmk__s(history->exit_reason, ""),
3775                 last_change_s, history->exit_status, history->id);
3776 
3777     free(last_change_s);
3778 
3779     pe__clear_resource_flags(history->rsc, pcmk_rsc_managed);
3780     pe__set_resource_flags(history->rsc, pcmk_rsc_blocked);
3781 }
3782 
3783 /*!
3784  * \internal
3785  * \brief Update action history's execution status and why
3786  *
3787  * \param[in,out] history  Parsed action history entry
3788  * \param[out]    why      Where to store reason for update
3789  * \param[in]     value    New value
3790  * \param[in]     reason   Description of why value was changed
3791  */
3792 static inline void
3793 remap_because(struct action_history *history, const char **why, int value,
3794               const char *reason)
3795 {
3796     if (history->execution_status != value) {
3797         history->execution_status = value;
3798         *why = reason;
3799     }
3800 }
3801 
3802 /*!
3803  * \internal
3804  * \brief Remap informational monitor results and operation status
3805  *
3806  * For monitor results, certain OCF codes exist to give the user extended
3807  * information about services that are not failed but also not entirely
3808  * healthy. Pacemaker must treat these as the "normal" (successful) result.
3809  *
3810  * For operation status, the action result can be used to determine an
3811  * appropriate status for responding to the action. The status reported by
3812  * the executor is not directly usable, since it does not know what was expected.
3813  *
3814  * \param[in,out] history  Parsed action history entry
3815  * \param[in,out] on_fail  What should be done about the result
3816  * \param[in]     expired  Whether result is expired
3817  *
3818  * \note If the result is remapped and the node is not shutting down or failed,
3819  *       the operation will be recorded in the scheduler data's list of failed
3820  *       operations to highlight it for the user.
3821  *
3822  * \note This may update the resource's current and next role.
3823  */
3824 static void
3825 remap_operation(struct action_history *history,
3826                 enum action_fail_response *on_fail, bool expired)
3827 {
3828     bool is_probe = false;
3829     int orig_exit_status = history->exit_status;
3830     int orig_exec_status = history->execution_status;
3831     const char *why = NULL;
3832     const char *task = history->task;
3833 
3834     // Remap degraded results to their successful counterparts
3835     history->exit_status = pcmk__effective_rc(history->exit_status);
3836     if (history->exit_status != orig_exit_status) {
3837         why = "degraded result";
3838         if (!expired && (!history->node->details->shutdown
3839                          || history->node->details->online)) {
3840             record_failed_op(history);
3841         }
3842     }
3843 
3844     if (!pe_rsc_is_bundled(history->rsc)
3845         && pcmk_xe_mask_probe_failure(history->xml)
3846         && ((history->execution_status != PCMK_EXEC_DONE)
3847             || (history->exit_status != PCMK_OCF_NOT_RUNNING))) {
3848         history->execution_status = PCMK_EXEC_DONE;
3849         history->exit_status = PCMK_OCF_NOT_RUNNING;
3850         why = "equivalent probe result";
3851     }
3852 
3853     /* If the executor reported an execution status of anything but done or
3854      * error, consider that final. But for done or error, we know better whether
3855      * it should be treated as a failure or not, because we know the expected
3856      * result.
3857      */
3858     switch (history->execution_status) {
3859         case PCMK_EXEC_DONE:
3860         case PCMK_EXEC_ERROR:
3861             break;
3862 
3863         // These should be treated as node-fatal
3864         case PCMK_EXEC_NO_FENCE_DEVICE:
3865         case PCMK_EXEC_NO_SECRETS:
3866             remap_because(history, &why, PCMK_EXEC_ERROR_HARD,
3867                           "node-fatal error");
3868             goto remap_done;
3869 
3870         default:
3871             goto remap_done;
3872     }
3873 
3874     is_probe = pcmk_xe_is_probe(history->xml);
3875     if (is_probe) {
3876         task = "probe";
3877     }
3878 
3879     if (history->expected_exit_status < 0) {
3880         /* Pre-1.0 Pacemaker versions, and Pacemaker 1.1.6 or earlier with
3881          * Heartbeat 2.0.7 or earlier as the cluster layer, did not include the
3882          * expected exit status in the transition key, which (along with the
3883          * similar case of a corrupted transition key in the CIB) will be
3884          * reported to this function as -1. Pacemaker 2.0+ does not support
3885          * rolling upgrades from those versions or processing of saved CIB files
3886          * from those versions, so we do not need to care much about this case.
3887          */
3888         remap_because(history, &why, PCMK_EXEC_ERROR,
3889                       "obsolete history format");
3890         crm_warn("Expected result not found for %s on %s "
3891                  "(corrupt or obsolete CIB?)",
3892                  history->key, pe__node_name(history->node));
3893 
3894     } else if (history->exit_status == history->expected_exit_status) {
3895         remap_because(history, &why, PCMK_EXEC_DONE, "expected result");
3896 
3897     } else {
3898         remap_because(history, &why, PCMK_EXEC_ERROR, "unexpected result");
3899         pe_rsc_debug(history->rsc,
3900                      "%s on %s: expected %d (%s), got %d (%s%s%s)",
3901                      history->key, pe__node_name(history->node),
3902                      history->expected_exit_status,
3903                      services_ocf_exitcode_str(history->expected_exit_status),
3904                      history->exit_status,
3905                      services_ocf_exitcode_str(history->exit_status),
3906                      (pcmk__str_empty(history->exit_reason)? "" : ": "),
3907                      pcmk__s(history->exit_reason, ""));
3908     }
3909 
3910     switch (history->exit_status) {
3911         case PCMK_OCF_OK:
3912             if (is_probe
3913                 && (history->expected_exit_status == PCMK_OCF_NOT_RUNNING)) {
3914                 char *last_change_s = last_change_str(history->xml);
3915 
3916                 remap_because(history, &why, PCMK_EXEC_DONE, "probe");
3917                 pe_rsc_info(history->rsc, "Probe found %s active on %s at %s",
3918                             history->rsc->id, pe__node_name(history->node),
3919                             last_change_s);
3920                 free(last_change_s);
3921             }
3922             break;
3923 
3924         case PCMK_OCF_NOT_RUNNING:
3925             if (is_probe
3926                 || (history->expected_exit_status == history->exit_status)
3927                 || !pcmk_is_set(history->rsc->flags, pcmk_rsc_managed)) {
3928 
3929                 /* For probes, recurring monitors for the Stopped role, and
3930                  * unmanaged resources, "not running" is not considered a
3931                  * failure.
3932                  */
3933                 remap_because(history, &why, PCMK_EXEC_DONE, "exit status");
3934                 history->rsc->role = pcmk_role_stopped;
3935                 *on_fail = pcmk_on_fail_ignore;
3936                 pe__set_next_role(history->rsc, pcmk_role_unknown,
3937                                   "not running");
3938             }
3939             break;
3940 
3941         case PCMK_OCF_RUNNING_PROMOTED:
3942             if (is_probe
3943                 && (history->exit_status != history->expected_exit_status)) {
3944                 char *last_change_s = last_change_str(history->xml);
3945 
3946                 remap_because(history, &why, PCMK_EXEC_DONE, "probe");
3947                 pe_rsc_info(history->rsc,
3948                             "Probe found %s active and promoted on %s at %s",
3949                             history->rsc->id, pe__node_name(history->node),
3950                             last_change_s);
3951                 free(last_change_s);
3952             }
3953             if (!expired
3954                 || (history->exit_status == history->expected_exit_status)) {
3955                 history->rsc->role = pcmk_role_promoted;
3956             }
3957             break;
3958 
3959         case PCMK_OCF_FAILED_PROMOTED:
3960             if (!expired) {
3961                 history->rsc->role = pcmk_role_promoted;
3962             }
3963             remap_because(history, &why, PCMK_EXEC_ERROR, "exit status");
3964             break;
3965 
3966         case PCMK_OCF_NOT_CONFIGURED:
3967             remap_because(history, &why, PCMK_EXEC_ERROR_FATAL, "exit status");
3968             break;
3969 
3970         case PCMK_OCF_UNIMPLEMENT_FEATURE:
3971             {
3972                 guint interval_ms = 0;
3973                 crm_element_value_ms(history->xml, XML_LRM_ATTR_INTERVAL_MS,
3974                                      &interval_ms);
3975 
3976                 if (interval_ms == 0) {
3977                     if (!expired) {
3978                         block_if_unrecoverable(history);
3979                     }
3980                     remap_because(history, &why, PCMK_EXEC_ERROR_HARD,
3981                                   "exit status");
3982                 } else {
3983                     remap_because(history, &why, PCMK_EXEC_NOT_SUPPORTED,
3984                                   "exit status");
3985                 }
3986             }
3987             break;
3988 
3989         case PCMK_OCF_NOT_INSTALLED:
3990         case PCMK_OCF_INVALID_PARAM:
3991         case PCMK_OCF_INSUFFICIENT_PRIV:
3992             if (!expired) {
3993                 block_if_unrecoverable(history);
3994             }
3995             remap_because(history, &why, PCMK_EXEC_ERROR_HARD, "exit status");
3996             break;
3997 
3998         default:
3999             if (history->execution_status == PCMK_EXEC_DONE) {
4000                 char *last_change_s = last_change_str(history->xml);
4001 
4002                 crm_info("Treating unknown exit status %d from %s of %s "
4003                          "on %s at %s as failure",
4004                          history->exit_status, task, history->rsc->id,
4005                          pe__node_name(history->node), last_change_s);
4006                 remap_because(history, &why, PCMK_EXEC_ERROR,
4007                               "unknown exit status");
4008                 free(last_change_s);
4009             }
4010             break;
4011     }
4012 
4013 remap_done:
4014     if (why != NULL) {
4015         pe_rsc_trace(history->rsc,
4016                      "Remapped %s result from [%s: %s] to [%s: %s] "
4017                      "because of %s",
4018                      history->key, pcmk_exec_status_str(orig_exec_status),
4019                      crm_exit_str(orig_exit_status),
4020                      pcmk_exec_status_str(history->execution_status),
4021                      crm_exit_str(history->exit_status), why);
4022     }
4023 }
4024 
4025 // Return TRUE if entry is a start or monitor last failure and the resource parameters have since changed
4026 static bool
4027 should_clear_for_param_change(const xmlNode *xml_op, const char *task,
4028                               pcmk_resource_t *rsc, pcmk_node_t *node)
4029 {
4030     if (pcmk__str_any_of(task, PCMK_ACTION_START, PCMK_ACTION_MONITOR, NULL)) {
4031         if (pe__bundle_needs_remote_name(rsc)) {
4032             /* We haven't allocated resources yet, so we can't reliably
4033              * substitute addr parameters for the REMOTE_CONTAINER_HACK.
4034              * When that's needed, defer the check until later.
4035              */
4036             pe__add_param_check(xml_op, rsc, node, pcmk__check_last_failure,
4037                                 rsc->cluster);
4038 
4039         } else {
4040             op_digest_cache_t *digest_data = NULL;
4041 
4042             digest_data = rsc_action_digest_cmp(rsc, xml_op, node,
4043                                                 rsc->cluster);
4044             switch (digest_data->rc) {
4045                 case pcmk__digest_unknown:
4046                     crm_trace("Resource %s history entry %s on %s"
4047                               " has no digest to compare",
4048                               rsc->id, pe__xe_history_key(xml_op),
4049                               node->details->id);
4050                     break;
4051                 case pcmk__digest_match:
4052                     break;
4053                 default:
4054                     return TRUE;
4055             }
4056         }
4057     }
4058     return FALSE;
4059 }
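
/* A sketch of the effect of the digest check above: each history entry stores
 * a hash of the parameters the action ran with, and rsc_action_digest_cmp()
 * compares it against a hash of the currently configured parameters. Anything
 * other than pcmk__digest_match (or pcmk__digest_unknown, when there is
 * nothing to compare) means the recorded failure no longer reflects the
 * current configuration and is eligible for clearing.
 */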
4060 
4061 // Order action after fencing of remote node, given connection rsc
4062 static void
4063 order_after_remote_fencing(pcmk_action_t *action, pcmk_resource_t *remote_conn,
4064                            pcmk_scheduler_t *scheduler)
4065 {
4066     pcmk_node_t *remote_node = pe_find_node(scheduler->nodes, remote_conn->id);
4067 
4068     if (remote_node) {
4069         pcmk_action_t *fence = pe_fence_op(remote_node, NULL, TRUE, NULL,
4070                                            FALSE, scheduler);
4071 
4072         order_actions(fence, action, pcmk__ar_first_implies_then);
4073     }
4074 }
4075 
4076 static bool
4077 should_ignore_failure_timeout(const pcmk_resource_t *rsc, const char *task,
4078                               guint interval_ms, bool is_last_failure)
4079 {
4080     /* Clearing failures of recurring monitors has special concerns. The
4081      * executor reports only changes in the monitor result, so if the
4082      * monitor is still active and still getting the same failure result,
4083      * that will go undetected after the failure is cleared.
4084      *
4085      * Also, the operation history will have the time when the recurring
4086      * monitor result changed to the given code, not the time when the
4087      * result last happened.
4088      *
4089      * @TODO We probably should clear such failures only when the failure
4090      * timeout has passed since the last occurrence of the failed result.
4091      * However we don't record that information. We could maybe approximate
4092      * that by clearing only if there is a more recent successful monitor or
4093      * stop result, but we don't even have that information at this point
4094      * since we are still unpacking the resource's operation history.
4095      *
4096      * This is especially important for remote connection resources with a
4097      * reconnect interval, so in that case, we skip clearing failures
4098      * if the remote node hasn't been fenced.
4099      */
4100     if (rsc->remote_reconnect_ms
4101         && pcmk_is_set(rsc->cluster->flags, pcmk_sched_fencing_enabled)
4102         && (interval_ms != 0)
4103         && pcmk__str_eq(task, PCMK_ACTION_MONITOR, pcmk__str_casei)) {
4104 
4105         pcmk_node_t *remote_node = pe_find_node(rsc->cluster->nodes, rsc->id);
4106 
4107         if (remote_node && !remote_node->details->remote_was_fenced) {
4108             if (is_last_failure) {
4109                 crm_info("Waiting to clear monitor failure for remote node %s"
4110                          " until fencing has occurred", rsc->id);
4111             }
4112             return TRUE;
4113         }
4114     }
4115     return FALSE;
4116 }
4117 
4118 /*!
4119  * \internal
4120  * \brief Check operation age and schedule failure clearing when appropriate
4121  *
4122  * This function has two distinct purposes. The first is to check whether an
4123  * operation history entry is expired (i.e. the resource has a failure timeout,
4124  * the entry is older than the timeout, and the resource either has no fail
4125  * count or its fail count is entirely older than the timeout). The second is to
4126  * schedule fail count clearing when appropriate: that is, when the operation
4127  * is expired and the resource's fail count has expired as well, when the
4128  * operation is an expired last_failure for a remote connection resource with
4129  * a reconnect interval, or when the operation is a last_failure for a start
4130  * or monitor whose resource parameters have changed since the operation ran.
4131  *
4132  * \param[in,out] history  Parsed action result history
4133  *
4134  * \return true if operation history entry is expired, otherwise false
4135  */
4136 static bool
4137 check_operation_expiry(struct action_history *history)
4138 {
4139     bool expired = false;
4140     bool is_last_failure = pcmk__ends_with(history->id, "_last_failure_0");
4141     time_t last_run = 0;
4142     int unexpired_fail_count = 0;
4143     const char *clear_reason = NULL;
4144 
4145     if (history->execution_status == PCMK_EXEC_NOT_INSTALLED) {
4146         pe_rsc_trace(history->rsc,
4147                      "Resource history entry %s on %s is not expired: "
4148                      "Not Installed does not expire",
4149                      history->id, pe__node_name(history->node));
4150         return false; // "Not installed" must always be cleared manually
4151     }
4152 
4153     if ((history->rsc->failure_timeout > 0)
4154         && (crm_element_value_epoch(history->xml, XML_RSC_OP_LAST_CHANGE,
4155                                     &last_run) == 0)) {
4156 
4157         // Resource has a failure-timeout, and history entry has a timestamp
4158 
4159         time_t now = get_effective_time(history->rsc->cluster);
4160         time_t last_failure = 0;
4161 
4162         // Is this particular operation history older than the failure timeout?
4163         if ((now >= (last_run + history->rsc->failure_timeout))
4164             && !should_ignore_failure_timeout(history->rsc, history->task,
4165                                               history->interval_ms,
4166                                               is_last_failure)) {
4167             expired = true;
4168         }
4169 
4170         // Does the resource as a whole have an unexpired fail count?
4171         unexpired_fail_count = pe_get_failcount(history->node, history->rsc,
4172                                                 &last_failure,
4173                                                 pcmk__fc_effective,
4174                                                 history->xml);
4175 
4176         // Update scheduler recheck time according to *last* failure
4177         crm_trace("%s@%lld is %sexpired @%lld with unexpired_failures=%d timeout=%ds"
4178                   " last-failure@%lld",
4179                   history->id, (long long) last_run, (expired? "" : "not "),
4180                   (long long) now, unexpired_fail_count,
4181                   history->rsc->failure_timeout, (long long) last_failure);
4182         last_failure += history->rsc->failure_timeout + 1;
4183         if (unexpired_fail_count && (now < last_failure)) {
4184             pe__update_recheck_time(last_failure, history->rsc->cluster,
4185                                     "fail count expiration");
4186         }
4187     }
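
    /* Worked example (treating failure-timeout as seconds, as the time_t
     * arithmetic above implies): with failure-timeout=60 and this entry, which
     * is also the most recent failure, recorded at t=1000, the entry is
     * considered expired once the effective time reaches t=1060; while a fail
     * count remains unexpired, a cluster recheck is requested for t=1061 so
     * the expiration is acted on promptly.
     */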
4188 
4189     if (expired) {
4190         if (pe_get_failcount(history->node, history->rsc, NULL,
4191                              pcmk__fc_default, history->xml)) {
4192             // There is a fail count ignoring timeout
4193 
4194             if (unexpired_fail_count == 0) {
4195                 // There is no fail count considering timeout
4196                 clear_reason = "it expired";
4197 
4198             } else {
4199                 /* This operation is old, but there is an unexpired fail count.
4200                  * In a properly functioning cluster, this should only be
4201                  * possible if this operation is not a failure (otherwise the
4202                  * fail count should be expired too), so this is really just a
4203                  * failsafe.
4204                  */
4205                 pe_rsc_trace(history->rsc,
4206                              "Resource history entry %s on %s is not expired: "
4207                              "Unexpired fail count",
4208                              history->id, pe__node_name(history->node));
4209                 expired = false;
4210             }
4211 
4212         } else if (is_last_failure
4213                    && (history->rsc->remote_reconnect_ms != 0)) {
4214             /* Clear any expired last failure when reconnect interval is set,
4215              * even if there is no fail count.
4216              */
4217             clear_reason = "reconnect interval is set";
4218         }
4219     }
4220 
4221     if (!expired && is_last_failure
4222         && should_clear_for_param_change(history->xml, history->task,
4223                                          history->rsc, history->node)) {
4224         clear_reason = "resource parameters have changed";
4225     }
4226 
4227     if (clear_reason != NULL) {
4228         pcmk_action_t *clear_op = NULL;
4229 
4230         // Schedule clearing of the fail count
4231         clear_op = pe__clear_failcount(history->rsc, history->node,
4232                                        clear_reason, history->rsc->cluster);
4233 
4234         if (pcmk_is_set(history->rsc->cluster->flags,
4235                         pcmk_sched_fencing_enabled)
4236             && (history->rsc->remote_reconnect_ms != 0)) {
4237             /* If we're clearing a remote connection due to a reconnect
4238              * interval, we want to wait until any scheduled fencing
4239              * completes.
4240              *
4241              * We could limit this to remote_node->details->unclean, but at
4242              * this point, that's always true (it won't be reliable until
4243              * after unpack_node_history() is done).
4244              */
4245             crm_info("Clearing %s failure will wait until any scheduled "
4246                      "fencing of %s completes",
4247                      history->task, history->rsc->id);
4248             order_after_remote_fencing(clear_op, history->rsc,
4249                                        history->rsc->cluster);
4250         }
4251     }
4252 
4253     if (expired && (history->interval_ms == 0)
4254         && pcmk__str_eq(history->task, PCMK_ACTION_MONITOR, pcmk__str_none)) {
4255         switch (history->exit_status) {
4256             case PCMK_OCF_OK:
4257             case PCMK_OCF_NOT_RUNNING:
4258             case PCMK_OCF_RUNNING_PROMOTED:
4259             case PCMK_OCF_DEGRADED:
4260             case PCMK_OCF_DEGRADED_PROMOTED:
4261                 // Don't expire probes that return these values
4262                 pe_rsc_trace(history->rsc,
4263                              "Resource history entry %s on %s is not expired: "
4264                              "Probe result",
4265                              history->id, pe__node_name(history->node));
4266                 expired = false;
4267                 break;
4268         }
4269     }
4270 
4271     return expired;
4272 }
4273 
4274 int
4275 pe__target_rc_from_xml(const xmlNode *xml_op)
4276 {
4277     int target_rc = 0;
4278     const char *key = crm_element_value(xml_op, XML_ATTR_TRANSITION_KEY);
4279 
4280     if (key == NULL) {
4281         return -1;
4282     }
4283     decode_transition_key(key, NULL, NULL, NULL, &target_rc);
4284     return target_rc;
4285 }
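
/* Example (the exact key layout is an assumption here; see
 * decode_transition_key()): a transition key such as "3:15:7:c1a2..." encodes
 * the action number, transition number, expected exit status (7 ==
 * PCMK_OCF_NOT_RUNNING, as expected for a probe of a stopped resource), and
 * the DC's UUID. A missing key yields the -1 returned above.
 */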
4286 
4287 /*!
4288  * \internal
4289  * \brief Update a resource's state for an action result
4290  *
4291  * \param[in,out] history       Parsed action history entry
4292  * \param[in]     exit_status   Exit status to base new state on
4293  * \param[in]     last_failure  Resource's last_failure entry, if known
4294  * \param[in,out] on_fail       Resource's current failure handling
4295  */
4296 static void
4297 update_resource_state(struct action_history *history, int exit_status,
4298                       const xmlNode *last_failure,
4299                       enum action_fail_response *on_fail)
4300 {
4301     bool clear_past_failure = false;
4302 
4303     if ((exit_status == PCMK_OCF_NOT_INSTALLED)
4304         || (!pe_rsc_is_bundled(history->rsc)
4305             && pcmk_xe_mask_probe_failure(history->xml))) {
4306         history->rsc->role = pcmk_role_stopped;
4307 
4308     } else if (exit_status == PCMK_OCF_NOT_RUNNING) {
4309         clear_past_failure = true;
4310 
4311     } else if (pcmk__str_eq(history->task, PCMK_ACTION_MONITOR,
4312                             pcmk__str_none)) {
4313         if ((last_failure != NULL)
4314             && pcmk__str_eq(history->key, pe__xe_history_key(last_failure),
4315                             pcmk__str_none)) {
4316             clear_past_failure = true;
4317         }
4318         if (history->rsc->role < pcmk_role_started) {
4319             set_active(history->rsc);
4320         }
4321 
4322     } else if (pcmk__str_eq(history->task, PCMK_ACTION_START, pcmk__str_none)) {
4323         history->rsc->role = pcmk_role_started;
4324         clear_past_failure = true;
4325 
4326     } else if (pcmk__str_eq(history->task, PCMK_ACTION_STOP, pcmk__str_none)) {
4327         history->rsc->role = pcmk_role_stopped;
4328         clear_past_failure = true;
4329 
4330     } else if (pcmk__str_eq(history->task, PCMK_ACTION_PROMOTE,
4331                             pcmk__str_none)) {
4332         history->rsc->role = pcmk_role_promoted;
4333         clear_past_failure = true;
4334 
4335     } else if (pcmk__str_eq(history->task, PCMK_ACTION_DEMOTE,
4336                             pcmk__str_none)) {
4337         if (*on_fail == pcmk_on_fail_demote) {
4338             // Demote clears an error only if on-fail=demote
4339             clear_past_failure = true;
4340         }
4341         history->rsc->role = pcmk_role_unpromoted;
4342 
4343     } else if (pcmk__str_eq(history->task, PCMK_ACTION_MIGRATE_FROM,
4344                             pcmk__str_none)) {
4345         history->rsc->role = pcmk_role_started;
4346         clear_past_failure = true;
4347 
4348     } else if (pcmk__str_eq(history->task, PCMK_ACTION_MIGRATE_TO,
4349                             pcmk__str_none)) {
4350         unpack_migrate_to_success(history);
4351 
4352     } else if (history->rsc->role < pcmk_role_started) {
4353         pe_rsc_trace(history->rsc, "%s active on %s",
4354                      history->rsc->id, pe__node_name(history->node));
4355         set_active(history->rsc);
4356     }
4357 
4358     if (!clear_past_failure) {
4359         return;
4360     }
4361 
4362     switch (*on_fail) {
4363         case pcmk_on_fail_stop:
4364         case pcmk_on_fail_ban:
4365         case pcmk_on_fail_standby_node:
4366         case pcmk_on_fail_fence_node:
4367             pe_rsc_trace(history->rsc,
4368                          "%s (%s) is not cleared by a completed %s",
4369                          history->rsc->id, fail2text(*on_fail), history->task);
4370             break;
4371 
4372         case pcmk_on_fail_block:
4373         case pcmk_on_fail_ignore:
4374         case pcmk_on_fail_demote:
4375         case pcmk_on_fail_restart:
4376         case pcmk_on_fail_restart_container:
4377             *on_fail = pcmk_on_fail_ignore;
4378             pe__set_next_role(history->rsc, pcmk_role_unknown,
4379                               "clear past failures");
4380             break;
4381 
4382         case pcmk_on_fail_reset_remote:
4383             if (history->rsc->remote_reconnect_ms == 0) {
4384                 /* With no reconnect interval, the connection is allowed to
4385                  * start again after the remote node is fenced and
4386                  * completely stopped. (With a reconnect interval, we wait
4387                  * for the failure to be cleared entirely before attempting
4388                  * to reconnect.)
4389                  */
4390                 *on_fail = pcmk_on_fail_ignore;
4391                 pe__set_next_role(history->rsc, pcmk_role_unknown,
4392                                   "clear past failures and reset remote");
4393             }
4394             break;
4395     }
4396 }
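
/* Example of the clearing behaviour above (descriptive only): if an earlier
 * failed monitor left *on_fail at pcmk_on_fail_restart and a later stop
 * completed successfully, *on_fail is reset to pcmk_on_fail_ignore and any
 * forced next role is cleared, whereas a failure that demanded fencing,
 * banning, stopping, or standby is deliberately not cancelled by a completed
 * action.
 */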
4397 
4398 /*!
4399  * \internal
4400  * \brief Check whether a given history entry matters for resource state
4401  *
4402  * \param[in] history  Parsed action history entry
4403  *
4404  * \return true if action can affect resource state, otherwise false
4405  */
4406 static inline bool
4407 can_affect_state(struct action_history *history)
4408 {
4409 #if 0
4410     /* @COMPAT It might be better to parse only actions we know we're interested
4411      * in, rather than exclude a couple we don't. However that would be a
4412      * behavioral change that should be done at a major or minor series release.
4413      * Currently, unknown operations can affect whether a resource is considered
4414      * active and/or failed.
4415      */
4416      return pcmk__str_any_of(history->task, PCMK_ACTION_MONITOR,
4417                              PCMK_ACTION_START, PCMK_ACTION_STOP,
4418                              PCMK_ACTION_PROMOTE, PCMK_ACTION_DEMOTE,
4419                              PCMK_ACTION_MIGRATE_TO, PCMK_ACTION_MIGRATE_FROM,
4420                              "asyncmon", NULL);
4421 #else
4422      return !pcmk__str_any_of(history->task, PCMK_ACTION_NOTIFY,
4423                               PCMK_ACTION_META_DATA, NULL);
4424 #endif
4425 }
4426 
4427 /*!
4428  * \internal
4429  * \brief Unpack execution/exit status and exit reason from a history entry
4430  *
4431  * \param[in,out] history  Action history entry to unpack
4432  *
4433  * \return Standard Pacemaker return code
4434  */
4435 static int
4436 unpack_action_result(struct action_history *history)
4437 {
4438     if ((crm_element_value_int(history->xml, XML_LRM_ATTR_OPSTATUS,
4439                                &(history->execution_status)) < 0)
4440         || (history->execution_status < PCMK_EXEC_PENDING)
4441         || (history->execution_status > PCMK_EXEC_MAX)
4442         || (history->execution_status == PCMK_EXEC_CANCELLED)) {
4443         crm_err("Ignoring resource history entry %s for %s on %s "
4444                 "with invalid " XML_LRM_ATTR_OPSTATUS " '%s'",
4445                 history->id, history->rsc->id, pe__node_name(history->node),
4446                 pcmk__s(crm_element_value(history->xml, XML_LRM_ATTR_OPSTATUS),
4447                         ""));
4448         return pcmk_rc_unpack_error;
4449     }
4450     if ((crm_element_value_int(history->xml, XML_LRM_ATTR_RC,
4451                                &(history->exit_status)) < 0)
4452         || (history->exit_status < 0) || (history->exit_status > CRM_EX_MAX)) {
4453 #if 0
4454         /* @COMPAT We should ignore malformed entries, but since that would
4455          * change behavior, it should be done at a major or minor series
4456          * release.
4457          */
4458         crm_err("Ignoring resource history entry %s for %s on %s "
4459                 "with invalid " XML_LRM_ATTR_RC " '%s'",
4460                 history->id, history->rsc->id, pe__node_name(history->node),
4461                 pcmk__s(crm_element_value(history->xml, XML_LRM_ATTR_RC),
4462                         ""));
4463         return pcmk_rc_unpack_error;
4464 #else
4465         history->exit_status = CRM_EX_ERROR;
4466 #endif
4467     }
4468     history->exit_reason = crm_element_value(history->xml,
4469                                              XML_LRM_ATTR_EXIT_REASON);
4470     return pcmk_rc_ok;
4471 }
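
/* A hedged example of the kind of entry parsed here (attribute names as they
 * commonly appear in lrm_rsc_op elements; the values are illustrative only):
 *
 *   <lrm_rsc_op id="myrsc_last_failure_0" operation="start" call-id="12"
 *               op-status="0" rc-code="1" exit-reason="config file missing"/>
 *
 * op-status becomes history->execution_status and rc-code becomes
 * history->exit_status; an out-of-range rc-code is coerced to CRM_EX_ERROR
 * rather than rejected, per the @COMPAT note above.
 */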
4472 
4473 /*!
4474  * \internal
4475  * \brief Process an action history entry whose result expired
4476  *
4477  * \param[in,out] history           Parsed action history entry
4478  * \param[in]     orig_exit_status  Action exit status before remapping
4479  *
4480  * \return Standard Pacemaker return code (in particular, pcmk_rc_ok means the
4481  *         entry needs no further processing)
4482  */
4483 static int
4484 process_expired_result(struct action_history *history, int orig_exit_status)
4485 {
4486     if (!pe_rsc_is_bundled(history->rsc)
4487         && pcmk_xe_mask_probe_failure(history->xml)
4488         && (orig_exit_status != history->expected_exit_status)) {
4489 
4490         if (history->rsc->role <= pcmk_role_stopped) {
4491             history->rsc->role = pcmk_role_unknown;
4492         }
4493         crm_trace("Ignoring resource history entry %s for probe of %s on %s: "
4494                   "Masked failure expired",
4495                   history->id, history->rsc->id,
4496                   pe__node_name(history->node));
4497         return pcmk_rc_ok;
4498     }
4499 
4500     if (history->exit_status == history->expected_exit_status) {
4501         return pcmk_rc_undetermined; // Only failures expire
4502     }
4503 
4504     if (history->interval_ms == 0) {
4505         crm_notice("Ignoring resource history entry %s for %s of %s on %s: "
4506                    "Expired failure",
4507                    history->id, history->task, history->rsc->id,
4508                    pe__node_name(history->node));
4509         return pcmk_rc_ok;
4510     }
4511 
4512     if (history->node->details->online && !history->node->details->unclean) {
4513         /* Reschedule the recurring action. schedule_cancel() won't work at
4514          * this stage, so as a hacky workaround, forcibly change the restart
4515          * digest so pcmk__check_action_config() does what we want later.
4516          *
4517          * @TODO We should skip this if there is a newer successful monitor.
4518          *       Also, this causes rescheduling only if the history entry
4519          *       has an op-digest (which the expire-non-blocked-failure
4520          *       scheduler regression test doesn't, but that may not be a
4521          *       realistic scenario in production).
4522          */
4523         crm_notice("Rescheduling %s-interval %s of %s on %s "
4524                    "after failure expired",
4525                    pcmk__readable_interval(history->interval_ms), history->task,
4526                    history->rsc->id, pe__node_name(history->node));
4527         crm_xml_add(history->xml, XML_LRM_ATTR_RESTART_DIGEST,
4528                     "calculated-failure-timeout");
4529         return pcmk_rc_ok;
4530     }
4531 
4532     return pcmk_rc_undetermined;
4533 }
4534 
4535 /*!
4536  * \internal
4537  * \brief Process a masked probe failure
4538  *
4539  * \param[in,out] history           Parsed action history entry
4540  * \param[in]     orig_exit_status  Action exit status before remapping
4541  * \param[in]     last_failure      Resource's last_failure entry, if known
4542  * \param[in,out] on_fail           Resource's current failure handling
4543  */
4544 static void
4545 mask_probe_failure(struct action_history *history, int orig_exit_status,
4546                    const xmlNode *last_failure,
4547                    enum action_fail_response *on_fail)
4548 {
4549     pcmk_resource_t *ban_rsc = history->rsc;
4550 
4551     if (!pcmk_is_set(history->rsc->flags, pcmk_rsc_unique)) {
4552         ban_rsc = uber_parent(history->rsc);
4553     }
4554 
4555     crm_notice("Treating probe result '%s' for %s on %s as 'not running'",
4556                services_ocf_exitcode_str(orig_exit_status), history->rsc->id,
4557                pe__node_name(history->node));
4558     update_resource_state(history, history->expected_exit_status, last_failure,
4559                           on_fail);
4560     crm_xml_add(history->xml, XML_ATTR_UNAME, history->node->details->uname);
4561 
4562     record_failed_op(history);
4563     resource_location(ban_rsc, history->node, -INFINITY, "masked-probe-failure",
4564                       history->rsc->cluster);
4565 }
4566 
4567 /*!
4568  * \internal
4569  * \brief Check whether a given failure is for a given pending action
 *
4570  * \param[in] history       Parsed history entry for pending action
4571  * \param[in] last_failure  Resource's last_failure entry, if known
4572  *
4573  * \return true if \p last_failure is failure of pending action in \p history,
4574  *         otherwise false
4575  * \note Both \p history and \p last_failure must come from the same
4576  *       lrm_resource block, as node and resource are assumed to be the same.
4577  */
4578 static bool
4579 failure_is_newer(const struct action_history *history,
4580                  const xmlNode *last_failure)
4581 {
4582     guint failure_interval_ms = 0U;
4583     long long failure_change = 0LL;
4584     long long this_change = 0LL;
4585 
4586     if (last_failure == NULL) {
4587         return false; // Resource has no last_failure entry
4588     }
4589 
4590     if (!pcmk__str_eq(history->task,
4591                       crm_element_value(last_failure, XML_LRM_ATTR_TASK),
4592                       pcmk__str_none)) {
4593         return false; // last_failure is for different action
4594     }
4595 
4596     if ((crm_element_value_ms(last_failure, XML_LRM_ATTR_INTERVAL_MS,
4597                               &failure_interval_ms) != pcmk_ok)
4598         || (history->interval_ms != failure_interval_ms)) {
4599         return false; // last_failure is for action with different interval
4600     }
4601 
4602     if ((pcmk__scan_ll(crm_element_value(history->xml, XML_RSC_OP_LAST_CHANGE),
4603                        &this_change, 0LL) != pcmk_rc_ok)
4604         || (pcmk__scan_ll(crm_element_value(last_failure,
4605                                             XML_RSC_OP_LAST_CHANGE),
4606                           &failure_change, 0LL) != pcmk_rc_ok)
4607         || (failure_change < this_change)) {
4608         return false; // Failure is not known to be newer
4609     }
4610 
4611     return true;
4612 }
4613 
4614 /*!
4615  * \internal
4616  * \brief Update a resource's role etc. for a pending action
4617  *
4618  * \param[in,out] history       Parsed history entry for pending action
4619  * \param[in]     last_failure  Resource's last_failure entry, if known
4620  */
4621 static void
4622 process_pending_action(struct action_history *history,
4623                        const xmlNode *last_failure)
4624 {
4625     /* For recurring monitors, a failure is recorded only in RSC_last_failure_0,
4626      * and there might be a RSC_monitor_INTERVAL entry with the last successful
4627      * or pending result.
4628      *
4629      * If last_failure contains the failure of the pending recurring monitor
4630      * we're processing here, and is newer, the action is no longer pending.
4631      * (Pending results have call ID -1, which sorts last, so the last failure
4632      * if any should be known.)
4633      */
4634     if (failure_is_newer(history, last_failure)) {
4635         return;
4636     }
4637 
4638     if (strcmp(history->task, PCMK_ACTION_START) == 0) {
4639         pe__set_resource_flags(history->rsc, pcmk_rsc_start_pending);
4640         set_active(history->rsc);
4641 
4642     } else if (strcmp(history->task, PCMK_ACTION_PROMOTE) == 0) {
4643         history->rsc->role = pcmk_role_promoted;
4644 
4645     } else if ((strcmp(history->task, PCMK_ACTION_MIGRATE_TO) == 0)
4646                && history->node->details->unclean) {
4647         /* A migrate_to action is pending on an unclean source, so force a stop
4648          * on the target.
4649          */
4650         const char *migrate_target = NULL;
4651         pcmk_node_t *target = NULL;
4652 
4653         migrate_target = crm_element_value(history->xml,
4654                                            XML_LRM_ATTR_MIGRATE_TARGET);
4655         target = pe_find_node(history->rsc->cluster->nodes, migrate_target);
4656         if (target != NULL) {
4657             stop_action(history->rsc, target, FALSE);
4658         }
4659     }
4660 
4661     if (history->rsc->pending_task != NULL) {
4662         /* There should never be multiple pending actions, but as a failsafe,
4663          * just remember the first one processed for display purposes.
4664          */
4665         return;
4666     }
4667 
4668     if (pcmk_is_probe(history->task, history->interval_ms)) {
4669         /* Pending probes are currently never displayed, even if pending
4670          * operations are requested. If we ever want to change that,
4671          * enable the below and the corresponding part of
4672          * native.c:native_pending_task().
4673          */
4674 #if 0
4675         history->rsc->pending_task = strdup("probe");
4676         history->rsc->pending_node = history->node;
4677 #endif
4678     } else {
4679         history->rsc->pending_task = strdup(history->task);
4680         history->rsc->pending_node = history->node;
4681     }
4682 }
4683 
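/* Unpack one action history entry (an lrm_rsc_op element) for a resource on a
 * node. Rough flow, descriptive only: validate the entry's ID, task, and
 * result; check whether any failure has expired; remap the raw executor result
 * via remap_operation(); then either track a pending action, apply a completed
 * result, or apply the configured failure handling, updating the caller's
 * last_failure and on_fail along the way.
 */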
4684 static void
4685 unpack_rsc_op(pcmk_resource_t *rsc, pcmk_node_t *node, xmlNode *xml_op,
4686               xmlNode **last_failure, enum action_fail_response *on_fail)
4687 {
4688     int old_rc = 0;
4689     bool expired = false;
4690     pcmk_resource_t *parent = rsc;
4691     enum rsc_role_e fail_role = pcmk_role_unknown;
4692     enum action_fail_response failure_strategy = pcmk_on_fail_restart;
4693 
4694     struct action_history history = {
4695         .rsc = rsc,
4696         .node = node,
4697         .xml = xml_op,
4698         .execution_status = PCMK_EXEC_UNKNOWN,
4699     };
4700 
4701     CRM_CHECK(rsc && node && xml_op, return);
4702 
4703     history.id = ID(xml_op);
4704     if (history.id == NULL) {
4705         crm_err("Ignoring resource history entry for %s on %s without ID",
4706                 rsc->id, pe__node_name(node));
4707         return;
4708     }
4709 
4710     // Task and interval
4711     history.task = crm_element_value(xml_op, XML_LRM_ATTR_TASK);
4712     if (history.task == NULL) {
4713         crm_err("Ignoring resource history entry %s for %s on %s without "
4714                 XML_LRM_ATTR_TASK, history.id, rsc->id, pe__node_name(node));
4715         return;
4716     }
4717     crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS,
4718                          &(history.interval_ms));
4719     if (!can_affect_state(&history)) {
4720         pe_rsc_trace(rsc,
4721                      "Ignoring resource history entry %s for %s on %s "
4722                      "with irrelevant action '%s'",
4723                      history.id, rsc->id, pe__node_name(node), history.task);
4724         return;
4725     }
4726 
4727     if (unpack_action_result(&history) != pcmk_rc_ok) {
4728         return; // Error already logged
4729     }
4730 
4731     history.expected_exit_status = pe__target_rc_from_xml(xml_op);
4732     history.key = pe__xe_history_key(xml_op);
4733     crm_element_value_int(xml_op, XML_LRM_ATTR_CALLID, &(history.call_id));
4734 
4735     pe_rsc_trace(rsc, "Unpacking %s (%s call %d on %s): %s (%s)",
4736                  history.id, history.task, history.call_id, pe__node_name(node),
4737                  pcmk_exec_status_str(history.execution_status),
4738                  crm_exit_str(history.exit_status));
4739 
4740     if (node->details->unclean) {
4741         pe_rsc_trace(rsc,
4742                      "%s is running on %s, which is unclean (further action "
4743                      "depends on value of stop's on-fail attribute)",
4744                      rsc->id, pe__node_name(node));
4745     }
4746 
4747     expired = check_operation_expiry(&history);
4748     old_rc = history.exit_status;
4749 
4750     remap_operation(&history, on_fail, expired);
4751 
4752     if (expired && (process_expired_result(&history, old_rc) == pcmk_rc_ok)) {
4753         goto done;
4754     }
4755 
4756     if (!pe_rsc_is_bundled(rsc) && pcmk_xe_mask_probe_failure(xml_op)) {
4757         mask_probe_failure(&history, old_rc, *last_failure, on_fail);
4758         goto done;
4759     }
4760 
4761     if (!pcmk_is_set(rsc->flags, pcmk_rsc_unique)) {
4762         parent = uber_parent(rsc);
4763     }
4764 
4765     switch (history.execution_status) {
4766         case PCMK_EXEC_PENDING:
4767             process_pending_action(&history, *last_failure);
4768             goto done;
4769 
4770         case PCMK_EXEC_DONE:
4771             update_resource_state(&history, history.exit_status, *last_failure,
4772                                   on_fail);
4773             goto done;
4774 
4775         case PCMK_EXEC_NOT_INSTALLED:
4776             unpack_failure_handling(&history, &failure_strategy, &fail_role);
4777             if (failure_strategy == pcmk_on_fail_ignore) {
4778                 crm_warn("Cannot ignore failed %s of %s on %s: "
4779                          "Resource agent doesn't exist "
4780                          CRM_XS " status=%d rc=%d id=%s",
4781                          history.task, rsc->id, pe__node_name(node),
4782                          history.execution_status, history.exit_status,
4783                          history.id);
4784                 /* This also causes the resource to be marked as
4785                  * pcmk_rsc_failed later, so it is displayed as "FAILED"
4786                  */
4787                 *on_fail = pcmk_on_fail_ban;
4788             }
4789             resource_location(parent, node, -INFINITY, "hard-error",
4790                               rsc->cluster);
4791             unpack_rsc_op_failure(&history, failure_strategy, fail_role,
4792                                   last_failure, on_fail);
4793             goto done;
4794 
4795         case PCMK_EXEC_NOT_CONNECTED:
4796             if (pe__is_guest_or_remote_node(node)
4797                 && pcmk_is_set(node->details->remote_rsc->flags,
4798                                pcmk_rsc_managed)) {
4799                 /* We should never get into a situation where a managed remote
4800                  * connection resource is considered OK but a resource action
4801                  * behind the connection gets a "not connected" status. But as a
4802                  * fail-safe in case a bug or unusual circumstances do lead to
4803                  * that, ensure the remote connection is considered failed.
4804                  */
4805                 pe__set_resource_flags(node->details->remote_rsc,
4806                                        pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
4807             }
4808             break; // Not done, do error handling
4809 
4810         case PCMK_EXEC_ERROR:
4811         case PCMK_EXEC_ERROR_HARD:
4812         case PCMK_EXEC_ERROR_FATAL:
4813         case PCMK_EXEC_TIMEOUT:
4814         case PCMK_EXEC_NOT_SUPPORTED:
4815         case PCMK_EXEC_INVALID:
4816             break; // Not done, do error handling
4817 
4818         default: // No other value should be possible at this point
4819             break;
4820     }
4821 
4822     unpack_failure_handling(&history, &failure_strategy, &fail_role);
4823     if ((failure_strategy == pcmk_on_fail_ignore)
4824         || ((failure_strategy == pcmk_on_fail_restart_container)
4825             && (strcmp(history.task, PCMK_ACTION_STOP) == 0))) {
4826 
4827         char *last_change_s = last_change_str(xml_op);
4828 
4829         crm_warn("Pretending failed %s (%s%s%s) of %s on %s at %s succeeded "
4830                  CRM_XS " %s",
4831                  history.task, services_ocf_exitcode_str(history.exit_status),
4832                  (pcmk__str_empty(history.exit_reason)? "" : ": "),
4833                  pcmk__s(history.exit_reason, ""), rsc->id, pe__node_name(node),
4834                  last_change_s, history.id);
4835         free(last_change_s);
4836 
4837         update_resource_state(&history, history.expected_exit_status,
4838                               *last_failure, on_fail);
4839         crm_xml_add(xml_op, XML_ATTR_UNAME, node->details->uname);
4840         pe__set_resource_flags(rsc, pcmk_rsc_ignore_failure);
4841 
4842         record_failed_op(&history);
4843 
4844         if ((failure_strategy == pcmk_on_fail_restart_container)
4845             && cmp_on_fail(*on_fail, pcmk_on_fail_restart) <= 0) {
4846             *on_fail = failure_strategy;
4847         }
4848 
4849     } else {
4850         unpack_rsc_op_failure(&history, failure_strategy, fail_role,
4851                               last_failure, on_fail);
4852 
4853         if (history.execution_status == PCMK_EXEC_ERROR_HARD) {
4854             uint8_t log_level = LOG_ERR;
4855 
4856             if (history.exit_status == PCMK_OCF_NOT_INSTALLED) {
4857                 log_level = LOG_NOTICE;
4858             }
4859             do_crm_log(log_level,
4860                        "Preventing %s from restarting on %s because "
4861                        "of hard failure (%s%s%s) " CRM_XS " %s",
4862                        parent->id, pe__node_name(node),
4863                        services_ocf_exitcode_str(history.exit_status),
4864                        (pcmk__str_empty(history.exit_reason)? "" : ": "),
4865                        pcmk__s(history.exit_reason, ""), history.id);
4866             resource_location(parent, node, -INFINITY, "hard-error",
4867                               rsc->cluster);
4868 
4869         } else if (history.execution_status == PCMK_EXEC_ERROR_FATAL) {
4870             crm_err("Preventing %s from restarting anywhere because "
4871                     "of fatal failure (%s%s%s) " CRM_XS " %s",
4872                     parent->id, services_ocf_exitcode_str(history.exit_status),
4873                     (pcmk__str_empty(history.exit_reason)? "" : ": "),
4874                     pcmk__s(history.exit_reason, ""), history.id);
4875             resource_location(parent, NULL, -INFINITY, "fatal-error",
4876                               rsc->cluster);
4877         }
4878     }
4879 
4880 done:
4881     pe_rsc_trace(rsc, "%s role on %s after %s is %s (next %s)",
4882                  rsc->id, pe__node_name(node), history.id,
4883                  role2text(rsc->role), role2text(rsc->next_role));
4884 }
4885 
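/* Populate a node's attribute table with built-in attributes (node name and
 * ID, whether the node is the DC, and, when configured, the cluster and site
 * names) plus any instance attributes and utilization values defined for the
 * node in the CIB.
 */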
4886 static void
4887 add_node_attrs(const xmlNode *xml_obj, pcmk_node_t *node, bool overwrite,
4888                pcmk_scheduler_t *scheduler)
4889 {
4890     const char *cluster_name = NULL;
4891 
4892     pe_rule_eval_data_t rule_data = {
4893         .node_hash = NULL,
4894         .role = pcmk_role_unknown,
4895         .now = scheduler->now,
4896         .match_data = NULL,
4897         .rsc_data = NULL,
4898         .op_data = NULL
4899     };
4900 
4901     g_hash_table_insert(node->details->attrs,
4902                         strdup(CRM_ATTR_UNAME), strdup(node->details->uname));
4903 
4904     g_hash_table_insert(node->details->attrs, strdup(CRM_ATTR_ID),
4905                         strdup(node->details->id));
4906     if (pcmk__str_eq(node->details->id, scheduler->dc_uuid, pcmk__str_casei)) {
4907         scheduler->dc_node = node;
4908         node->details->is_dc = TRUE;
4909         g_hash_table_insert(node->details->attrs,
4910                             strdup(CRM_ATTR_IS_DC), strdup(XML_BOOLEAN_TRUE));
4911     } else {
4912         g_hash_table_insert(node->details->attrs,
4913                             strdup(CRM_ATTR_IS_DC), strdup(XML_BOOLEAN_FALSE));
4914     }
4915 
4916     cluster_name = g_hash_table_lookup(scheduler->config_hash, "cluster-name");
4917     if (cluster_name) {
4918         g_hash_table_insert(node->details->attrs, strdup(CRM_ATTR_CLUSTER_NAME),
4919                             strdup(cluster_name));
4920     }
4921 
4922     pe__unpack_dataset_nvpairs(xml_obj, XML_TAG_ATTR_SETS, &rule_data,
4923                                node->details->attrs, NULL, overwrite,
4924                                scheduler);
4925 
4926     pe__unpack_dataset_nvpairs(xml_obj, XML_TAG_UTILIZATION, &rule_data,
4927                                node->details->utilization, NULL,
4928                                FALSE, scheduler);
4929 
4930     if (pe_node_attribute_raw(node, CRM_ATTR_SITE_NAME) == NULL) {
4931         const char *site_name = pe_node_attribute_raw(node, "site-name");
4932 
4933         if (site_name) {
4934             g_hash_table_insert(node->details->attrs,
4935                                 strdup(CRM_ATTR_SITE_NAME),
4936                                 strdup(site_name));
4937 
4938         } else if (cluster_name) {
4939             /* Default to cluster-name if unset */
4940             g_hash_table_insert(node->details->attrs,
4941                                 strdup(CRM_ATTR_SITE_NAME),
4942                                 strdup(cluster_name));
4943         }
4944     }
4945 }
4946 
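/* Build a list of the lrm_rsc_op entries recorded for one resource on one
 * node, sorted by call ID. With active_filter, calculate_active_ops() is used
 * to drop history from before the most recent start, and nothing is returned
 * if the resource was stopped more recently than it was started.
 */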
4947 static GList *
4948 extract_operations(const char *node, const char *rsc, xmlNode * rsc_entry, gboolean active_filter)
4949 {
4950     int counter = -1;
4951     int stop_index = -1;
4952     int start_index = -1;
4953 
4954     xmlNode *rsc_op = NULL;
4955 
4956     GList *gIter = NULL;
4957     GList *op_list = NULL;
4958     GList *sorted_op_list = NULL;
4959 
4960     /* extract operations */
4961     op_list = NULL;
4962     sorted_op_list = NULL;
4963 
4964     for (rsc_op = pcmk__xe_first_child(rsc_entry);
4965          rsc_op != NULL; rsc_op = pcmk__xe_next(rsc_op)) {
4966 
4967         if (pcmk__str_eq((const char *)rsc_op->name, XML_LRM_TAG_RSC_OP,
4968                          pcmk__str_none)) {
4969             crm_xml_add(rsc_op, "resource", rsc);
4970             crm_xml_add(rsc_op, XML_ATTR_UNAME, node);
4971             op_list = g_list_prepend(op_list, rsc_op);
4972         }
4973     }
4974 
4975     if (op_list == NULL) {
4976         /* if there are no operations, there is nothing to do */
4977         return NULL;
4978     }
4979 
4980     sorted_op_list = g_list_sort(op_list, sort_op_by_callid);
4981 
4982     /* create active recurring operations as optional */
4983     if (active_filter == FALSE) {
4984         return sorted_op_list;
4985     }
4986 
4987     op_list = NULL;
4988 
4989     calculate_active_ops(sorted_op_list, &start_index, &stop_index);
4990 
4991     for (gIter = sorted_op_list; gIter != NULL; gIter = gIter->next) {
4992         xmlNode *rsc_op = (xmlNode *) gIter->data;
4993 
4994         counter++;
4995 
4996         if (start_index < stop_index) {
4997             crm_trace("Skipping %s: not active", ID(rsc_entry));
4998             break;
4999 
5000         } else if (counter < start_index) {
5001             crm_trace("Skipping %s: old", ID(rsc_op));
5002             continue;
5003         }
5004         op_list = g_list_append(op_list, rsc_op);
5005     }
5006 
5007     g_list_free(sorted_op_list);
5008     return op_list;
5009 }
5010 
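/* Collect matching operation history from the CIB status section: for each
 * node_state entry (or only the named node), determine the node's online
 * status and, if the node is online or fencing is enabled, gather the
 * operations recorded for each matching resource via extract_operations().
 */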
5011 GList *
5012 find_operations(const char *rsc, const char *node, gboolean active_filter,
5013                 pcmk_scheduler_t *scheduler)
5014 {
5015     GList *output = NULL;
5016     GList *intermediate = NULL;
5017 
5018     xmlNode *tmp = NULL;
5019     xmlNode *status = find_xml_node(scheduler->input, XML_CIB_TAG_STATUS, TRUE);
5020 
5021     pcmk_node_t *this_node = NULL;
5022 
5023     xmlNode *node_state = NULL;
5024 
5025     for (node_state = pcmk__xe_first_child(status); node_state != NULL;
5026          node_state = pcmk__xe_next(node_state)) {
5027 
5028         if (pcmk__str_eq((const char *)node_state->name, XML_CIB_TAG_STATE, pcmk__str_none)) {
5029             const char *uname = crm_element_value(node_state, XML_ATTR_UNAME);
5030 
5031             if (node != NULL && !pcmk__str_eq(uname, node, pcmk__str_casei)) {
5032                 continue;
5033             }
5034 
5035             this_node = pe_find_node(scheduler->nodes, uname);
5036             if(this_node == NULL) {
5037                 CRM_LOG_ASSERT(this_node != NULL);
5038                 continue;
5039 
5040             } else if (pe__is_guest_or_remote_node(this_node)) {
5041                 determine_remote_online_status(scheduler, this_node);
5042 
5043             } else {
5044                 determine_online_status(node_state, this_node, scheduler);
5045             }
5046 
5047             if (this_node->details->online
5048                 || pcmk_is_set(scheduler->flags, pcmk_sched_fencing_enabled)) {
5049                 /* Offline nodes run no resources, unless stonith is
5050                  * enabled, in which case we need to make sure resource
5051                  * start events happen after the stonith
5052                  */
5053                 xmlNode *lrm_rsc = NULL;
5054 
5055                 tmp = find_xml_node(node_state, XML_CIB_TAG_LRM, FALSE);
5056                 tmp = find_xml_node(tmp, XML_LRM_TAG_RESOURCES, FALSE);
5057 
5058                 for (lrm_rsc = pcmk__xe_first_child(tmp); lrm_rsc != NULL;
5059                      lrm_rsc = pcmk__xe_next(lrm_rsc)) {
5060 
5061                     if (pcmk__str_eq((const char *)lrm_rsc->name,
5062                                      XML_LRM_TAG_RESOURCE, pcmk__str_none)) {
5063 
5064                         const char *rsc_id = crm_element_value(lrm_rsc, XML_ATTR_ID);
5065 
5066                         if (rsc != NULL && !pcmk__str_eq(rsc_id, rsc, pcmk__str_casei)) {
5067                             continue;
5068                         }
5069 
5070                         intermediate = extract_operations(uname, rsc_id, lrm_rsc, active_filter);
5071                         output = g_list_concat(output, intermediate);
5072                     }
5073                 }
5074             }
5075         }
5076     }
5077 
5078     return output;
5079 }
