root/lib/pengine/failcounts.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. is_matched_failure
  2. block_failure
  3. rsc_fail_name
  4. generate_fail_regex
  5. generate_fail_regexes
  6. update_failcount_for_attr
  7. update_failcount_for_filler
  8. pe_get_failcount
  9. pe__clear_failcount

   1 /*
   2  * Copyright 2008-2023 the Pacemaker project contributors
   3  *
   4  * This source code is licensed under the GNU Lesser General Public License
   5  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
   6  */
   7 
   8 #include <crm_internal.h>
   9 
  10 #include <sys/types.h>
  11 #include <regex.h>
  12 #include <glib.h>
  13 
  14 #include <crm/crm.h>
  15 #include <crm/msg_xml.h>
  16 #include <crm/common/xml.h>
  17 #include <crm/common/util.h>
  18 #include <crm/pengine/internal.h>
  19 
  20 static gboolean
  21 is_matched_failure(const char *rsc_id, const xmlNode *conf_op_xml,
     /* [previous][next][first][last][top][bottom][index][help] */
  22                    const xmlNode *lrm_op_xml)
  23 {
  24     gboolean matched = FALSE;
  25     const char *conf_op_name = NULL;
  26     const char *lrm_op_task = NULL;
  27     const char *conf_op_interval_spec = NULL;
  28     guint conf_op_interval_ms = 0;
  29     guint lrm_op_interval_ms = 0;
  30     const char *lrm_op_id = NULL;
  31     char *last_failure_key = NULL;
  32 
  33     if (rsc_id == NULL || conf_op_xml == NULL || lrm_op_xml == NULL) {
  34         return FALSE;
  35     }
  36 
  37     // Get name and interval from configured op
  38     conf_op_name = crm_element_value(conf_op_xml, "name");
  39     conf_op_interval_spec = crm_element_value(conf_op_xml,
  40                                               XML_LRM_ATTR_INTERVAL);
  41     conf_op_interval_ms = crm_parse_interval_spec(conf_op_interval_spec);
  42 
  43     // Get name and interval from op history entry
  44     lrm_op_task = crm_element_value(lrm_op_xml, XML_LRM_ATTR_TASK);
  45     crm_element_value_ms(lrm_op_xml, XML_LRM_ATTR_INTERVAL_MS,
  46                          &lrm_op_interval_ms);
  47 
  48     if ((conf_op_interval_ms != lrm_op_interval_ms)
  49         || !pcmk__str_eq(conf_op_name, lrm_op_task, pcmk__str_casei)) {
  50         return FALSE;
  51     }
  52 
  53     lrm_op_id = ID(lrm_op_xml);
  54     last_failure_key = pcmk__op_key(rsc_id, "last_failure", 0);
  55 
  56     if (pcmk__str_eq(last_failure_key, lrm_op_id, pcmk__str_casei)) {
  57         matched = TRUE;
  58 
  59     } else {
  60         char *expected_op_key = pcmk__op_key(rsc_id, conf_op_name,
  61                                                 conf_op_interval_ms);
  62 
  63         if (pcmk__str_eq(expected_op_key, lrm_op_id, pcmk__str_casei)) {
  64             int rc = 0;
  65             int target_rc = pe__target_rc_from_xml(lrm_op_xml);
  66 
  67             crm_element_value_int(lrm_op_xml, XML_LRM_ATTR_RC, &rc);
  68             if (rc != target_rc) {
  69                 matched = TRUE;
  70             }
  71         }
  72         free(expected_op_key);
  73     }
  74 
  75     free(last_failure_key);
  76     return matched;
  77 }
  78 
  79 static gboolean
  80 block_failure(const pcmk_node_t *node, pcmk_resource_t *rsc,
     /* [previous][next][first][last][top][bottom][index][help] */
  81               const xmlNode *xml_op)
  82 {
  83     char *xml_name = clone_strip(rsc->id);
  84 
  85     /* @TODO This xpath search occurs after template expansion, but it is unable
  86      * to properly detect on-fail in id-ref, operation meta-attributes, or
  87      * op_defaults, or evaluate rules.
  88      *
  89      * Also, on-fail defaults to block (in unpack_operation()) for stop actions
  90      * when stonith is disabled.
  91      *
  92      * Ideally, we'd unpack the operation before this point, and pass in a
  93      * meta-attributes table that takes all that into consideration.
  94      */
  95     char *xpath = crm_strdup_printf("//" XML_CIB_TAG_RESOURCE
  96                                     "[@" XML_ATTR_ID "='%s']"
  97                                     "//" XML_ATTR_OP
  98                                     "[@" XML_OP_ATTR_ON_FAIL "='block']",
  99                                     xml_name);
 100 
 101     xmlXPathObject *xpathObj = xpath_search(rsc->xml, xpath);
 102     gboolean should_block = FALSE;
 103 
 104     free(xpath);
 105 
 106     if (xpathObj) {
 107         int max = numXpathResults(xpathObj);
 108         int lpc = 0;
 109 
 110         for (lpc = 0; lpc < max; lpc++) {
 111             xmlNode *pref = getXpathResult(xpathObj, lpc);
 112 
 113             if (xml_op) {
 114                 should_block = is_matched_failure(xml_name, pref, xml_op);
 115                 if (should_block) {
 116                     break;
 117                 }
 118 
 119             } else {
 120                 const char *conf_op_name = NULL;
 121                 const char *conf_op_interval_spec = NULL;
 122                 guint conf_op_interval_ms = 0;
 123                 char *lrm_op_xpath = NULL;
 124                 xmlXPathObject *lrm_op_xpathObj = NULL;
 125 
 126                 // Get name and interval from configured op
 127                 conf_op_name = crm_element_value(pref, "name");
 128                 conf_op_interval_spec = crm_element_value(pref, XML_LRM_ATTR_INTERVAL);
 129                 conf_op_interval_ms = crm_parse_interval_spec(conf_op_interval_spec);
 130 
 131 #define XPATH_FMT "//" XML_CIB_TAG_STATE "[@" XML_ATTR_UNAME "='%s']"       \
 132                   "//" XML_LRM_TAG_RESOURCE "[@" XML_ATTR_ID "='%s']"       \
 133                   "/" XML_LRM_TAG_RSC_OP "[@" XML_LRM_ATTR_TASK "='%s']"    \
 134                   "[@" XML_LRM_ATTR_INTERVAL "='%u']"
 135 
 136                 lrm_op_xpath = crm_strdup_printf(XPATH_FMT,
 137                                                  node->details->uname, xml_name,
 138                                                  conf_op_name,
 139                                                  conf_op_interval_ms);
 140                 lrm_op_xpathObj = xpath_search(rsc->cluster->input, lrm_op_xpath);
 141 
 142                 free(lrm_op_xpath);
 143 
 144                 if (lrm_op_xpathObj) {
 145                     int max2 = numXpathResults(lrm_op_xpathObj);
 146                     int lpc2 = 0;
 147 
 148                     for (lpc2 = 0; lpc2 < max2; lpc2++) {
 149                         xmlNode *lrm_op_xml = getXpathResult(lrm_op_xpathObj,
 150                                                              lpc2);
 151 
 152                         should_block = is_matched_failure(xml_name, pref,
 153                                                           lrm_op_xml);
 154                         if (should_block) {
 155                             break;
 156                         }
 157                     }
 158                 }
 159                 freeXpathObject(lrm_op_xpathObj);
 160 
 161                 if (should_block) {
 162                     break;
 163                 }
 164             }
 165         }
 166     }
 167 
 168     free(xml_name);
 169     freeXpathObject(xpathObj);
 170 
 171     return should_block;
 172 }
 173 
 174 /*!
 175  * \internal
 176  * \brief Get resource name as used in failure-related node attributes
 177  *
 178  * \param[in] rsc  Resource to check
 179  *
 180  * \return Newly allocated string containing resource's fail name
 181  * \note The caller is responsible for freeing the result.
 182  */
 183 static inline char *
 184 rsc_fail_name(const pcmk_resource_t *rsc)
     /* [previous][next][first][last][top][bottom][index][help] */
 185 {
 186     const char *name = (rsc->clone_name? rsc->clone_name : rsc->id);
 187 
 188     return pcmk_is_set(rsc->flags, pcmk_rsc_unique)? strdup(name) : clone_strip(name);
 189 }
 190 
 191 /*!
 192  * \internal
 193  * \brief Compile regular expression to match a failure-related node attribute
 194  *
 195  * \param[in]  prefix    Attribute prefix to match
 196  * \param[in]  rsc_name  Resource name to match as used in failure attributes
 197  * \param[in]  is_legacy Whether DC uses per-resource fail counts
 198  * \param[in]  is_unique Whether the resource is a globally unique clone
 199  * \param[out] re        Where to store resulting regular expression
 200  *
 201  * \return Standard Pacemaker return code
 202  * \note Fail attributes are named like PREFIX-RESOURCE#OP_INTERVAL.
 203  *       The caller is responsible for freeing re with regfree().
 204  */
 205 static int
 206 generate_fail_regex(const char *prefix, const char *rsc_name,
     /* [previous][next][first][last][top][bottom][index][help] */
 207                     gboolean is_legacy, gboolean is_unique, regex_t *re)
 208 {
 209     char *pattern;
 210 
 211     /* @COMPAT DC < 1.1.17: Fail counts used to be per-resource rather than
 212      * per-operation.
 213      */
 214     const char *op_pattern = (is_legacy? "" : "#.+_[0-9]+");
 215 
 216     /* Ignore instance numbers for anything other than globally unique clones.
 217      * Anonymous clone fail counts could contain an instance number if the
 218      * clone was initially unique, failed, then was converted to anonymous.
 219      * @COMPAT Also, before 1.1.8, anonymous clone fail counts always contained
 220      * clone instance numbers.
 221      */
 222     const char *instance_pattern = (is_unique? "" : "(:[0-9]+)?");
 223 
 224     pattern = crm_strdup_printf("^%s-%s%s%s$", prefix, rsc_name,
 225                                 instance_pattern, op_pattern);
 226     if (regcomp(re, pattern, REG_EXTENDED|REG_NOSUB) != 0) {
 227         free(pattern);
 228         return EINVAL;
 229     }
 230 
 231     free(pattern);
 232     return pcmk_rc_ok;
 233 }
 234 
 235 /*!
 236  * \internal
 237  * \brief Compile regular expressions to match failure-related node attributes
 238  *
 239  * \param[in]  rsc             Resource being checked for failures
 240  * \param[out] failcount_re    Storage for regular expression for fail count
 241  * \param[out] lastfailure_re  Storage for regular expression for last failure
 242  *
 243  * \return Standard Pacemaker return code
 244  * \note On success, the caller is responsible for freeing the expressions with
 245  *       regfree().
 246  */
 247 static int
 248 generate_fail_regexes(const pcmk_resource_t *rsc,
     /* [previous][next][first][last][top][bottom][index][help] */
 249                       regex_t *failcount_re, regex_t *lastfailure_re)
 250 {
 251     int rc = pcmk_rc_ok;
 252     char *rsc_name = rsc_fail_name(rsc);
 253     const char *version = crm_element_value(rsc->cluster->input,
 254                                             XML_ATTR_CRM_VERSION);
 255 
 256     // @COMPAT Pacemaker <= 1.1.16 used a single fail count per resource
 257     gboolean is_legacy = (compare_version(version, "3.0.13") < 0);
 258 
 259     if (generate_fail_regex(PCMK__FAIL_COUNT_PREFIX, rsc_name, is_legacy,
 260                             pcmk_is_set(rsc->flags, pcmk_rsc_unique),
 261                             failcount_re) != pcmk_rc_ok) {
 262         rc = EINVAL;
 263 
 264     } else if (generate_fail_regex(PCMK__LAST_FAILURE_PREFIX, rsc_name,
 265                                    is_legacy,
 266                                    pcmk_is_set(rsc->flags, pcmk_rsc_unique),
 267                                    lastfailure_re) != pcmk_rc_ok) {
 268         rc = EINVAL;
 269         regfree(failcount_re);
 270     }
 271 
 272     free(rsc_name);
 273     return rc;
 274 }
 275 
 276 // Data for fail-count-related iterators
 277 struct failcount_data {
 278     const pcmk_node_t *node;// Node to check for fail count
 279     pcmk_resource_t *rsc;     // Resource to check for fail count
 280     uint32_t flags;         // Fail count flags
 281     const xmlNode *xml_op;  // History entry for expiration purposes (or NULL)
 282     regex_t failcount_re;   // Fail count regular expression to match
 283     regex_t lastfailure_re; // Last failure regular expression to match
 284     int failcount;          // Fail count so far
 285     time_t last_failure;    // Time of most recent failure so far
 286 };
 287 
 288 /*!
 289  * \internal
 290  * \brief Update fail count and last failure appropriately for a node attribute
 291  *
 292  * \param[in] key        Node attribute name
 293  * \param[in] value      Node attribute value
 294  * \param[in] user_data  Fail count data to update
 295  */
 296 static void
 297 update_failcount_for_attr(gpointer key, gpointer value, gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 298 {
 299     struct failcount_data *fc_data = user_data;
 300 
 301     // If this is a matching fail count attribute, update fail count
 302     if (regexec(&(fc_data->failcount_re), (const char *) key, 0, NULL, 0) == 0) {
 303         fc_data->failcount = pcmk__add_scores(fc_data->failcount,
 304                                               char2score(value));
 305         pe_rsc_trace(fc_data->rsc, "Added %s (%s) to %s fail count (now %s)",
 306                      (const char *) key, (const char *) value, fc_data->rsc->id,
 307                      pcmk_readable_score(fc_data->failcount));
 308         return;
 309     }
 310 
 311     // If this is a matching last failure attribute, update last failure
 312     if (regexec(&(fc_data->lastfailure_re), (const char *) key, 0, NULL,
 313                 0) == 0) {
 314         long long last_ll;
 315 
 316         if (pcmk__scan_ll(value, &last_ll, 0LL) == pcmk_rc_ok) {
 317             fc_data->last_failure = (time_t) QB_MAX(fc_data->last_failure,
 318                                                     last_ll);
 319         }
 320     }
 321 }
 322 
 323 /*!
 324  * \internal
 325  * \brief Update fail count and last failure appropriately for a filler resource
 326  *
 327  * \param[in] data       Filler resource
 328  * \param[in] user_data  Fail count data to update
 329  */
 330 static void
 331 update_failcount_for_filler(gpointer data, gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 332 {
 333     pcmk_resource_t *filler = data;
 334     struct failcount_data *fc_data = user_data;
 335     time_t filler_last_failure = 0;
 336 
 337     fc_data->failcount += pe_get_failcount(fc_data->node, filler,
 338                                            &filler_last_failure, fc_data->flags,
 339                                            fc_data->xml_op);
 340     fc_data->last_failure = QB_MAX(fc_data->last_failure, filler_last_failure);
 341 }
 342 
 343 /*!
 344  * \internal
 345  * \brief Get a resource's fail count on a node
 346  *
 347  * \param[in]     node          Node to check
 348  * \param[in,out] rsc           Resource to check
 349  * \param[out]    last_failure  If not NULL, where to set time of most recent
 350  *                              failure of \p rsc on \p node
 351  * \param[in]     flags         Group of enum pcmk__fc_flags
 352  * \param[in]     xml_op        If not NULL, consider only the action in this
 353  *                              history entry when determining whether on-fail
 354  *                              is configured as "blocked", otherwise consider
 355  *                              all actions configured for \p rsc
 356  *
 357  * \return Fail count for \p rsc on \p node according to \p flags
 358  */
 359 int
 360 pe_get_failcount(const pcmk_node_t *node, pcmk_resource_t *rsc,
     /* [previous][next][first][last][top][bottom][index][help] */
 361                  time_t *last_failure, uint32_t flags, const xmlNode *xml_op)
 362 {
 363     struct failcount_data fc_data = {
 364         .node = node,
 365         .rsc = rsc,
 366         .flags = flags,
 367         .xml_op = xml_op,
 368         .failcount = 0,
 369         .last_failure = (time_t) 0,
 370     };
 371 
 372     // Calculate resource failcount as sum of all matching operation failcounts
 373     CRM_CHECK(generate_fail_regexes(rsc, &fc_data.failcount_re,
 374                                     &fc_data.lastfailure_re) == pcmk_rc_ok,
 375               return 0);
 376     g_hash_table_foreach(node->details->attrs, update_failcount_for_attr,
 377                          &fc_data);
 378     regfree(&(fc_data.failcount_re));
 379     regfree(&(fc_data.lastfailure_re));
 380 
 381     // If failure blocks the resource, disregard any failure timeout
 382     if ((fc_data.failcount > 0) && (rsc->failure_timeout > 0)
 383         && block_failure(node, rsc, xml_op)) {
 384 
 385         pe_warn("Ignoring failure timeout %d for %s "
 386                 "because it conflicts with on-fail=block",
 387                 rsc->failure_timeout, rsc->id);
 388         rsc->failure_timeout = 0;
 389     }
 390 
 391     // If all failures have expired, ignore fail count
 392     if (pcmk_is_set(flags, pcmk__fc_effective) && (fc_data.failcount > 0)
 393         && (fc_data.last_failure > 0) && (rsc->failure_timeout != 0)) {
 394 
 395         time_t now = get_effective_time(rsc->cluster);
 396 
 397         if (now > (fc_data.last_failure + rsc->failure_timeout)) {
 398             pe_rsc_debug(rsc, "Failcount for %s on %s expired after %ds",
 399                          rsc->id, pe__node_name(node), rsc->failure_timeout);
 400             fc_data.failcount = 0;
 401         }
 402     }
 403 
 404     /* Add the fail count of any filler resources, except that we never want the
 405      * fail counts of a bundle container's fillers to count towards the
 406      * container's fail count.
 407      *
 408      * Most importantly, a Pacemaker Remote connection to a bundle container
 409      * is a filler of the container, but can reside on a different node than the
 410      * container itself. Counting its fail count on its node towards the
 411      * container's fail count on that node could lead to attempting to stop the
 412      * container on the wrong node.
 413      */
 414     if (pcmk_is_set(flags, pcmk__fc_fillers) && (rsc->fillers != NULL)
 415         && !pe_rsc_is_bundled(rsc)) {
 416 
 417         g_list_foreach(rsc->fillers, update_failcount_for_filler, &fc_data);
 418         if (fc_data.failcount > 0) {
 419             pe_rsc_info(rsc,
 420                         "Container %s and the resources within it "
 421                         "have failed %s time%s on %s",
 422                         rsc->id, pcmk_readable_score(fc_data.failcount),
 423                         pcmk__plural_s(fc_data.failcount), pe__node_name(node));
 424         }
 425 
 426     } else if (fc_data.failcount > 0) {
 427         pe_rsc_info(rsc, "%s has failed %s time%s on %s",
 428                     rsc->id, pcmk_readable_score(fc_data.failcount),
 429                     pcmk__plural_s(fc_data.failcount), pe__node_name(node));
 430     }
 431 
 432     if (last_failure != NULL) {
 433         if ((fc_data.failcount > 0) && (fc_data.last_failure > 0)) {
 434             *last_failure = fc_data.last_failure;
 435         } else  {
 436             *last_failure = 0;
 437         }
 438     }
 439     return fc_data.failcount;
 440 }
 441 
 442 /*!
 443  * \brief Schedule a controller operation to clear a fail count
 444  *
 445  * \param[in,out] rsc        Resource with failure
 446  * \param[in]     node       Node failure occurred on
 447  * \param[in]     reason     Readable description why needed (for logging)
 448  * \param[in,out] scheduler  Scheduler data cluster
 449  *
 450  * \return Scheduled action
 451  */
 452 pcmk_action_t *
 453 pe__clear_failcount(pcmk_resource_t *rsc, const pcmk_node_t *node,
     /* [previous][next][first][last][top][bottom][index][help] */
 454                     const char *reason, pcmk_scheduler_t *scheduler)
 455 {
 456     char *key = NULL;
 457     pcmk_action_t *clear = NULL;
 458 
 459     CRM_CHECK(rsc && node && reason && scheduler, return NULL);
 460 
 461     key = pcmk__op_key(rsc->id, PCMK_ACTION_CLEAR_FAILCOUNT, 0);
 462     clear = custom_action(rsc, key, PCMK_ACTION_CLEAR_FAILCOUNT, node, FALSE,
 463                           scheduler);
 464     add_hash_param(clear->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);
 465     crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s",
 466                rsc->id, pe__node_name(node), reason, clear->uuid);
 467     return clear;
 468 }

/* [previous][next][first][last][top][bottom][index][help] */