Unexpected Values Being Read Into Memory with getline() - C++

I wrote this program for an intro to C++ course. My issue is that unexpected values are being stored in memory. I assume it has to do with input.getline() or the way certain characters are stored, but I don't know enough about what is happening "under the hood" to fix it.

Specifically, certain characters, such as apostrophes and quotation marks, do not appear in memory as their expected ASCII hex values.

I'm pretty certain the issue lies in the lines

input.getline(raw_paragraph, MAX_PARAGRAPH_CHARS);
charCount = strlen(raw_paragraph);

Below I've included the complete code, a screenshot of the Memory window from Visual Studio 2022, the test case, and the program output.

Thank you in advance!

#pragma warning(disable : 4996) //DEV

/**************************************************************************************
    Header Content
**************************************************************************************/

// Includes and namespaces ------------------------------------------------------------

#include <cstdlib>       // Defines functions such as exit().
#include <cstring>       // Defines functions such as strcmp, etc.
#include <fstream>       // Supports file I/O
#include <iostream>      // Supports terminal I/O

using namespace std;

// Constants Declared -----------------------------------------------------------------

// Capacity limits: these size the fixed buffers that hold the input text.
const int MAX_WORD_CHARS = 50;    // Storage reserved per word, in chars
const int MAX_WORDS = 1000;       // Most words a paragraph may contain
const int MAX_PARAGRAPH_CHARS = MAX_WORD_CHARS * MAX_WORDS; // 50 * 1000 = 50000

// The two halves of the infinitive "to be"
const char TO[] = "to";
const char BE[] = "be";

// Conjugated forms of "to be"
const int NUM_TO_BE_VERBS = 5;    // Number of entries in TO_BE_VERBS
const char TO_BE_VERBS[NUM_TO_BE_VERBS][MAX_WORD_CHARS] = {
    "am", "are", "is", "was", "were"
};

// Conjunctions
const int NUM_CONJUNCTIONS = 7;   // Number of entries in CONJUNCTIONS
const char CONJUNCTIONS[NUM_CONJUNCTIONS][MAX_WORD_CHARS] = {
    "for", "and", "nor", "but", "or", "yet", "so"
};

// Punctuation marks
const int NUM_PUNCTUATIONS = 4;   // Number of entries in PUNCTUATIONS
const char PUNCTUATIONS[NUM_PUNCTUATIONS] = { '.', ',', '?', '!' };


// Functions Declared -----------------------------------------------------------------

// Counts the words among the first b rows of a that end in ',' and are
// immediately followed by a word listed in CONJUNCTIONS.
int countComplex(char a[][MAX_WORD_CHARS], int b);
// Counts the '.', '?' and '!' characters among the first b chars of a.
int countSentences(char a[], int b);
// Counts the words among the first wc rows of a that appear in TO_BE_VERBS,
// plus every "to" that is directly followed by "be".
int count_to_be_verbs(char a[][MAX_WORD_CHARS], int wc);
// Resets the C string a to the empty string.
void init_array(char* a);
// For each of the first wc rows of a: lowercases a leading 'A'-'Z' and removes
// one trailing ',', '!', '?' or '.'. Modifies a in place.
void modify_tokens(char a[][MAX_WORD_CHARS], int wc);
// Splits p on spaces, newlines and tabs (p is modified by strtok), copies the
// words into consecutive rows of tp, and returns how many were stored.
int tokenizeParagraph(char p[], char tp[][MAX_WORD_CHARS]);


/**************************************************************************************
    Begin Main 
**************************************************************************************/

/*
    Program entry point.

    Prompts for a filename, reads the first line of that file as the paragraph
    to analyze, and prints the character, word and sentence counts, the average
    number of words per sentence, the number of simple sentences, and the number
    of "to be" verbs.

    Returns 1 if the file cannot be opened or is empty, 0 otherwise.
*/
int main()
{
    // Format Output: fixed notation, decimal point always shown, one decimal place
    cout.setf(ios::fixed);
    cout.setf(ios::showpoint);
    cout.precision(1);

    // All buffers are zero-initialized so that each one starts out as a valid,
    // empty C string and no later strlen/strcmp can read indeterminate memory.
    char filename[256] = {};                      // User-supplied name of the input file
    char raw_paragraph[MAX_PARAGRAPH_CHARS] = {}; // Plaintext read from that file

    // Asks the user for the name of an input file which contains a paragraph
    cout << "Enter a filename: ";
    cin.getline(filename, sizeof(filename));

    // Try to load the file in filename:
    ifstream input;
    input.open(filename);

    // If the file could not be opened, report it and stop with status 1
    if (input.fail())
    {
        cout << "Input file " << filename << " does not exist." << endl;
        cout << "Thank you for using the English Analyzer." << endl;
        return 1;
    }

    // If the very first read hits end-of-file the file is empty: report it and
    // stop with status 1. Otherwise put the probed char back for getline below.
    char c;
    input.get(c);
    if (input.eof())
    {
        cout << "File " << filename << " is empty." << endl;
        cout << "Thank you for using the English Analyzer." << endl;
        return 1;
    }
    else
        input.putback(c);

    // Store plaintext from file to raw_paragraph.
    // NOTE(review): getline stops at the first '\n', so only the first line of
    // the file is analyzed - confirm the input is always a single line.
    input.getline(raw_paragraph, MAX_PARAGRAPH_CHARS);

    // Close ifstream input, will not need it again.
    input.close();

    // Output of tokenizeParagraph: one word per row. Zero-initialized so that
    // the row just past the last stored word is an empty string, not garbage.
    char tkn_para[MAX_WORDS][MAX_WORD_CHARS] = {};

    // Count chars (must happen before tokenizing: strtok overwrites the
    // delimiters in raw_paragraph with terminators)
    const int charCount = static_cast<int>(strlen(raw_paragraph));

    // Tokenize paragraph, count words
    const int wordCount = tokenizeParagraph(raw_paragraph, tkn_para);

    // Count Sentences
    const int sentenceCount = countSentences(raw_paragraph, charCount);

    // Average words per sentence; stays 0.0 when there are no sentences so the
    // report never divides by zero.
    double averageWordsPerSentence = 0.0;
    if (sentenceCount > 0)
        averageWordsPerSentence =
            static_cast<double>(wordCount) / static_cast<double>(sentenceCount);

    // Count Complex Sentences. This has to run before modify_tokens, which
    // strips the trailing commas that countComplex looks for.
    const int complex_count = countComplex(tkn_para, wordCount);

    // Calculate Simple Sentences
    const int simpleSent = sentenceCount - complex_count;

    // Count "to be" verbs on the normalized (lowercased, punctuation-stripped) words
    modify_tokens(tkn_para, wordCount);
    const int to_be_count = count_to_be_verbs(tkn_para, wordCount);

    // Cout results
    cout << "Number of Characters: " << charCount << endl;
    cout << "Number of words: " << wordCount << endl;
    cout << "Number of sentences: " << sentenceCount << endl;
    cout << "Average number words in a sentence: " << averageWordsPerSentence << endl;
    cout << "Number of simple sentences: " << simpleSent << endl;
    cout << "Number of \"to be\" verbs: " << to_be_count << endl;

    return 0;
}


/**************************************************************************************
    Function Definitions
**************************************************************************************/

/*
    Counts complex sentences, approximated as a word that ends in a comma and
    is immediately followed by a word from CONJUNCTIONS.

    a : tokenized words, one NUL-terminated word per row
    b : number of valid words in a

    Returns the number of comma + conjunction pairs found. The comparison is
    case-sensitive and expects the commas to still be attached to the words.
*/
int countComplex(char a[][MAX_WORD_CHARS], int b)
{
    // counter will keep the number of complex sentences found.
    int counter = 0;

    // Stop one word short of the end: the last word has no successor, so
    // looking at a[i + 1] there would read past the valid words.
    for (int i = 0; i + 1 < b; i++)
    {
        // Index of the word's final char; -1 for an empty word, which
        // cannot end in a comma and must not be indexed.
        const int last = static_cast<int>(strlen(a[i])) - 1;
        if (last < 0 || a[i][last] != ',')
            continue;

        // Compare the following word against every conjunction
        for (int x = 0; x < NUM_CONJUNCTIONS; x++)
        {
            if (strcmp(a[i + 1], CONJUNCTIONS[x]) == 0)
            {
                counter++;

                // A word can match at most one conjunction, so there is
                // nothing to gain from checking the remaining ones.
                break;
            }
        }
    }

    return counter;
}

/*
    Counts end-of-sentence punctuation marks.

    a : characters to scan
    b : how many characters of a to examine, starting at a[0]

    Returns how many of those characters are '.', '?' or '!'. Every mark is
    counted individually, so "..." contributes three.
*/
int countSentences(char a[], int b)
{
    int total = 0;
    int pos = 0;

    while (pos < b)
    {
        switch (a[pos])
        {
        case '.':
        case '?':
        case '!':
            // Sentence terminator found
            ++total;
            break;
        default:
            break;
        }
        ++pos;
    }

    return total;
}

/*
    Counts occurrences of the verb "to be".

    a  : tokenized words, one NUL-terminated word per row
    wc : number of valid words in a

    Returns the number of words that exactly match an entry of TO_BE_VERBS,
    plus one for every "to" that is directly followed by "be". Matching is
    case-sensitive, so callers normally run modify_tokens first.
*/
int count_to_be_verbs(char a[][MAX_WORD_CHARS], int wc)
{
    int counter = 0;

    // For each word in a:
    for (int i = 0; i < wc; i++)
    {
        // Conjugated forms: compare against each word in TO_BE_VERBS
        for (int y = 0; y < NUM_TO_BE_VERBS; y++)
        {
            if (strcmp(a[i], TO_BE_VERBS[y]) == 0)
            {
                counter++;
                break;  // The entries are distinct, so no second match is possible
            }
        }

        // Infinitive: "to" followed by "be". The i + 1 < wc check keeps the
        // lookahead inside the valid words when "to" is the very last one.
        if (i + 1 < wc && strcmp(a[i], TO) == 0 && strcmp(a[i + 1], BE) == 0)
            counter++;
    }

    return counter;
}

/*
    Resets a C string to the empty string.

    a : buffer to reset; a null pointer is ignored

    Only the first char is written. The buffer's capacity is not known here,
    so the remaining bytes are left untouched; callers that need a fully
    zeroed array should zero-initialize it where it is declared.

    The buffer's current contents are deliberately never read: a freshly
    declared array holds indeterminate bytes and may contain no terminator,
    so measuring it with strlen would be undefined behavior.
*/
void init_array(char* a)
{
    if (a)
        a[0] = '\0';
}

/*
    Normalizes tokenized words in place so they can be compared against the
    lowercase word lists.

    a  : tokenized words, one NUL-terminated word per row
    wc : number of valid words in a

    For each word: a leading 'A'-'Z' is converted to lowercase, and a single
    trailing ',', '!', '?' or '.' is removed. Only the first and last chars
    are examined.
*/
void modify_tokens(char a[][MAX_WORD_CHARS], int wc)
{
    // For each word in a:
    for (int i = 0; i < wc; i++)
    {
        char* word = a[i];

        // Index of the final char; -1 when the word is empty
        const int last = static_cast<int>(strlen(word)) - 1;

        // Convert the first char, if uppercase, into lowercase. This relies on
        // the letters being contiguous, as they are in ASCII ('a' - 'A' == 32).
        if ('A' <= word[0] && word[0] <= 'Z')
            word[0] = static_cast<char>(word[0] - 'A' + 'a');

        // An empty word has no last char to inspect
        if (last < 0)
            continue;

        // Drop a trailing punctuation mark by terminating the string there
        const char tail = word[last];
        if (tail == ',' || tail == '!' || tail == '?' || tail == '.')
            word[last] = '\0';
    }
}

/*
    Splits a paragraph into whitespace-separated words.

    p  : NUL-terminated paragraph; modified in place, because strtok
         overwrites each delimiter it finds with a terminator
    tp : destination, one word per row

    Words are separated by spaces, newlines and tabs. A word longer than
    MAX_WORD_CHARS - 1 chars is truncated to fit its row instead of
    overflowing into the next one.

    Returns the number of words stored.

    NOTE(review): the number of rows in tp is not passed in, so it cannot be
    checked here; the caller must provide at least as many rows as p has words.
*/
int tokenizeParagraph(char p[], char tp[][MAX_WORD_CHARS])
{
    const char delimiters[] = " \n\t";
    int count = 0;

    for (char* token = strtok(p, delimiters);
         token != NULL;
         token = strtok(NULL, delimiters))
    {
        // strncpy does not terminate the destination when the source fills
        // it completely, so the last char of the row is always set explicitly.
        strncpy(tp[count], token, MAX_WORD_CHARS - 1);
        tp[count][MAX_WORD_CHARS - 1] = '\0';
        count++;
    }

    return count;
}

c++visual-studio memory fstream getline

Solution 1:^[1]

It turns out that the issue comes from copying the test case through MS Word or a similar application. Such programs replace straight apostrophes and quotation marks with typographic ("curly") equivalents that "lean" left or right. Those characters are distinct from the ASCII ones, and they are responsible for the unexpected memory values I was encountering. A future iteration of the code would therefore have to scan the raw input for those characters and replace them with their ASCII equivalents.

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution	Source
Solution 1	Undercaffinated

Unexpected Values Being Read Into Memory with getline() - C++

Solution 1:[1]

Sources

Related Questions

Solution 1:^[1]