'Delete duplicate words from document more efficiently

I compare each word with other and check if that is the duplicate if yes then delete it. For 1 to 4 pages it takes at most 5 minutes.

For a document of 50 or 100 pages I need of modification or a new idea to compare and delete duplicates with less time.

Sub Delete_Duplicates()
    '***********'
    'By
    'MBA
    '***********'
    Dim AD As Range
    Dim F As Range
    Dim i As Long
    
    Set AD = ActiveDocument.Range
    Z = AD.Words.Count
    y = 1
    For i = Z To 1 Step -1
        y = y + 1
        
        Set F = AD.Words(i)
        
        On Error Resume Next
        Set s = AD.Words(i - 1)
        If Trim(AD.Words(i - 1)) = "," Then Set s = AD.Words(i - 2): Set c = AD.Words(i - 1)
        If Err.Number > 0 Then Exit Sub
            
        If Not F.Text = Chr(13) And UCase(Trim(F.Text)) = UCase(Trim(s.Text)) Then
            F.Text = ""
            If Not c Is Nothing Then c.Text = " ": Set c = Nothing
        End If
        If Not c Is Nothing Then Set c = Nothing
    
        On Error Resume Next
        Call ProgressBar.Progress(y / Z * 100, True) '<<-- Progress Bar
        On Error GoTo 0
    
    Next
    Beep
End Sub

Before/After
enter image description here



Solution 1:[1]

You might try something along the lines of:

Sub Demo()
Application.ScreenUpdating = False
With ActiveDocument.Range
  With .Find
    .ClearFormatting
    .Replacement.ClearFormatting
    .Replacement.Text = "\1"
    .Forward = True
    .Wrap = wdFindContinue
    .MatchWildcards = True
    .Text = "([A-Za-z0-9'’]@)[, ]@\1"
    .Execute
    Do While .Found = True
      .Execute Replace:=wdReplaceAll
    Loop
    .Text = "([A-Za-z0-9'’]@[, ]@[A-Za-z0-9'’]@)[, ]@\1"
    .Execute
    Do While .Found = True
      .Execute Replace:=wdReplaceAll
    Loop
  End With
End With
Application.ScreenUpdating = True
End Sub

Solution 2:[2]

Assuming that the entire document is plain text, we can assign the entire document's text and use Split to convert it into array of words.

Since it's in array, it will be faster to process through them all vs accessing the Words collection.

This is all I can think of but perhaps there's a better way to do this? Below example uses Regex to search through and replace all matched duplicate:

Option Explicit

Sub Delete_Duplicate()
    Const maxWord As Long = 2 'Change this to increase the max amount of words should be used to match as a phrase.
    
    Dim fullTxt As String
    fullTxt = ActiveDocument.Range.Text
    
    Dim txtArr() As String
    txtArr = Split(fullTxt, " ")
    
    Dim regex As RegExp
    Set regex = New RegExp
    regex.Global = True
    regex.IgnoreCase = True
    
    Dim outputTxt As String
    outputTxt = fullTxt
    
    Dim n As Long
    Dim i As Long
    
    For i = UBound(txtArr) To 0 Step -1
        Dim matchWord As String
        
        matchWord = vbNullString
        For n = 0 To maxWord - 1
            If (i - n) < 0 Then Exit For
            
            matchWord = txtArr(i - n) & " " & matchWord
            matchWord = Trim$(Replace(matchWord, vbCr, vbNullString))
        
            regex.Pattern = matchWord & "[, ]{0,}" & matchWord
            If regex.test(outputTxt) Then
                outputTxt = regex.Replace(outputTxt, matchWord)
            End If
        Next n
    Next i
    Set regex = Nothing

    Application.UndoRecord.StartCustomRecord "Delete Duplicates"
    ActiveDocument.Range.Text = outputTxt
    Application.UndoRecord.EndCustomRecord
End Sub

Solution 3:[3]

It is only conception but try to prepare list of all words in document and replace double or triple words if existing.

Private Sub DeleteDuplicate()
    
    Dim wholeTxt As String
    
    Dim w As Range
    Dim col As New Collection
    Dim c
    
    For Each w In ActiveDocument.Words
        AddUniqueItem col, Trim(w.Text)
    Next w

    wholeTxt = ActiveDocument.Range.Text
    
    For Each c In col
        
        'add case with ","
        'maybe one letter word should be forbidden, or add extra boundary
        If InStr(1, wholeTxt, c & " " & c, vbBinaryCompare) <> 0 Then
            'start of doc
            Selection.HomeKey wdStory
            
            'here should be all stuff to prepare replacement
            '(...)
            Selection.Find.Execute Findtext:=c & " " & c, ReplaceWith:=c
            wholeTxt = ActiveDocument.Range.Text
        End If
    Next c
    
    Set col = Nothing
End Sub
Private Sub AddUniqueItem(ByRef col As Collection, ByVal itemValAndKey As String)
    Dim s As String
    On Error Resume Next
    s = col(itemValAndKey)
    If Err.Number <> 0 Then
        col.Add itemValAndKey, itemValAndKey
        Err.Clear
    End If
    On Error GoTo 0
End Sub

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 macropod
Solution 2 Raymond Wu
Solution 3 deku