Regular expression in C not working as I expected

I tested this regular expression on a regex checker website and it came back as a match, but when I try to implement the same logic in C I do not get a match. Why is Ba-9 not matching the regex even though it should?

#include <stdio.h>
#include <regex.h>
/*
 * Report whether the string `ptr` is exactly equal to the keyword `reserved`.
 * Returns 1 when both strings have the same characters and the same length,
 * and 0 otherwise (C has no built-in bool here, so int is used).
 */
int isReserved(char* ptr, char* reserved){
    int pos;

    /* Walk both strings in lockstep until `ptr` ends or a character differs. */
    for (pos = 0; ptr[pos] != '\0'; pos++) {
        if (ptr[pos] != reserved[pos])
            return 0;
    }

    /* `ptr` is exhausted; it is a full match only if `reserved` ends here too. */
    return reserved[pos] == '\0' ? 1 : 0;
}

/* Returns 1 if `ptr` is exactly the reserved word "TAGS", otherwise 0. */
int isTAGS(char* ptr){
    return isReserved(ptr, "TAGS");
}
/* Returns 1 if `ptr` is exactly the reserved word "BEGIN", otherwise 0. */
int isBEGIN(char* ptr){
    return isReserved(ptr, "BEGIN");
}

/* Returns 1 if `ptr` is exactly the reserved word "SEQUENCE", otherwise 0. */
int isSEQUENCE(char* ptr){
    return isReserved(ptr, "SEQUENCE");
}
/* Returns 1 if `ptr` is exactly the reserved word "INTEGER", otherwise 0. */
int isINTEGER(char* ptr){
    return isReserved(ptr, "INTEGER");
}
/* Returns 1 if `ptr` is exactly the reserved word "DATE", otherwise 0. */
int isDATE(char* ptr){
    return isReserved(ptr, "DATE");
}
/* Returns 1 if `ptr` is exactly the reserved word "END", otherwise 0. */
int isEND(char* ptr){
    return isReserved(ptr, "END");
}


/*
 * Compile the identifier pattern and test it against the sample "Ba-9".
 * Prints the result of regexec(): 0 means the string matched, a non-zero
 * value (REG_NOMATCH) means it did not.
 * Returns 0 on success, 1 if the pattern could not be compiled.
 */
int main(void) {
    regex_t regex;
    char errbuf[128];
    int value;

    /*
     * REG_EXTENDED is required here. With cflags 0 the pattern is compiled as
     * a POSIX *basic* regular expression, in which unescaped '(' and ')' are
     * ordinary characters rather than grouping operators, so "Ba-9" (which
     * contains no literal parentheses) can never match.
     */
    value = regcomp(&regex, "^[A-Z](([0-9a-zA-Z]*-[0-9a-zA-Z]*)*[0-9a-zA-Z]*)*", REG_EXTENDED);
    if (value != 0) {
        /* Translate the regcomp error code into a readable message. */
        regerror(value, &regex, errbuf, sizeof errbuf);
        fprintf(stderr, "regcomp failed: %s\n", errbuf);
        return 1;
    }

    value = regexec(&regex, "Ba-9", 0, NULL, 0);
    printf("%i", value); //0 is a regex match

    regfree(&regex); /* release memory allocated by regcomp */
    return 0;
}


//    char* string = "END";
//    printf("%i",isEND(string));


Solution 1:[1]

Would you please try the following:

#include <stdio.h>
#include <string.h>
#include <regex.h>

/*
 * Read "test.txt" line by line and report, for each line, whether it matches
 * the identifier pattern (compiled as a POSIX extended regular expression).
 * Returns 0 on success, 1 if the file cannot be opened or the pattern fails
 * to compile.
 */
int main(void) {
    regex_t regex;
    int value;
    char str[BUFSIZ];
    char errbuf[128];
    FILE *fp;

    if (NULL == (fp = fopen("test.txt", "r"))) {                        // read a test file "test.txt"
        perror("test.txt");
        return 1;                                                       // return rather than exit(): <stdlib.h> is not included
    }

    value = regcomp(&regex, "^[A-Z]([0-9a-zA-Z]*-[0-9a-zA-Z]+)*([0-9a-zA-Z]*-[0-9a-zA-Z]*[0-9]+[0-9a-zA-Z]*|[0-9a-zA-Z]*[0-9]+[0-9a-zA-Z]*-[0-9a-zA-Z]+)+(-[0-9a-zA-Z])*$", REG_EXTENDED);
    if (value != 0) {                                                   // pattern failed to compile
        regerror(value, &regex, errbuf, sizeof errbuf);
        fprintf(stderr, "regcomp failed: %s\n", errbuf);
        fclose(fp);
        return 1;
    }

    while (NULL != (fgets(str, sizeof str, fp))) {                      // loop over each line
        str[strcspn(str, "\n")] = '\0';                                 // remove trailing newline character
        value = regexec(&regex, str, 0, NULL, 0);                       // test the regex
        printf("%-10s: %s\n", str, value ? "no match" : "match");       // show the result
    }

    regfree(&regex);                                                    // release memory allocated by regcomp
    fclose(fp);
    return 0;
}

test.txt:

Ba-9
Ba
B9-
B-9
B--9
B9--
B-9-
B-9-9
B-9-a
B01-2
B1a-2
B1-A
B-A
BA1-2
BA1-a
BA-2
B-A-2
B-2
B-2a

Result of execution:

Ba-9      : match
Ba        : no match
B9-       : no match
B-9       : match
B--9      : no match
B9--      : no match
B-9-      : no match
B-9-9     : match
B-9-a     : match
B01-2     : match
B1a-2     : match
B1-A      : match
B-A       : no match
BA1-2     : match
BA1-a     : match
BA-2      : match
B-A-2     : match
B-2       : match
B-2a      : match

[Explanation]

Let me abbreviate [0-9a-zA-Z] as \w and [0-9] as \d purely for the sake of explanation; POSIX regular expressions do not define these shorthands, so they cannot be used portably in the C code itself.
Then the regex

^[A-Z]([0-9a-zA-Z]*-[0-9a-zA-Z]+)*([0-9a-zA-Z]*-[0-9a-zA-Z]*[0-9]+[0-9a-zA-Z]*|[0-9a-zA-Z]*[0-9]+[0-9a-zA-Z]*-[0-9a-zA-Z]+)+(-[0-9a-zA-Z])*$

can be shortened as:

^[A-Z](\w*-\w+)*(\w*-\w*\d+\w*|\w*\d+\w*-\w+)+(-\w)*$

which will be more legible.

  • The most essential part is (\w*-\w*\d+\w*|\w*\d+\w*-\w+)+, which is an alternation of either \w*-\w*\d+\w* or \w*\d+\w*-\w+. This part ensures that the string contains at least one digit and at least one hyphen; the leading [A-Z] additionally guarantees at least one letter.
  • The surrounding expressions (\w*-\w+)* and (-\w)* allow extra substrings composed of alphanumeric characters and a hyphen.
  • No subexpression is allowed to end with a hyphen, which prevents the regex from matching two or more consecutive hyphens and/or a hyphen at the end of the string.

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 tshiono