'Delete duplicate words from document more efficiently
I compare each word with other and check if that is the duplicate if yes then delete it. For 1 to 4 pages it takes at most 5 minutes.
For a document of 50 or 100 pages I need of modification or a new idea to compare and delete duplicates with less time.
Sub Delete_Duplicates()
'***********'
'By
'MBA
'***********'
Dim AD As Range
Dim F As Range
Dim i As Long
Set AD = ActiveDocument.Range
Z = AD.Words.Count
y = 1
For i = Z To 1 Step -1
y = y + 1
Set F = AD.Words(i)
On Error Resume Next
Set s = AD.Words(i - 1)
If Trim(AD.Words(i - 1)) = "," Then Set s = AD.Words(i - 2): Set c = AD.Words(i - 1)
If Err.Number > 0 Then Exit Sub
If Not F.Text = Chr(13) And UCase(Trim(F.Text)) = UCase(Trim(s.Text)) Then
F.Text = ""
If Not c Is Nothing Then c.Text = " ": Set c = Nothing
End If
If Not c Is Nothing Then Set c = Nothing
On Error Resume Next
Call ProgressBar.Progress(y / Z * 100, True) '<<-- Progress Bar
On Error GoTo 0
Next
Beep
End Sub
Solution 1:[1]
You might try something along the lines of:
Sub Demo()
Application.ScreenUpdating = False
With ActiveDocument.Range
With .Find
.ClearFormatting
.Replacement.ClearFormatting
.Replacement.Text = "\1"
.Forward = True
.Wrap = wdFindContinue
.MatchWildcards = True
.Text = "([A-Za-z0-9'’]@)[, ]@\1"
.Execute
Do While .Found = True
.Execute Replace:=wdReplaceAll
Loop
.Text = "([A-Za-z0-9'’]@[, ]@[A-Za-z0-9'’]@)[, ]@\1"
.Execute
Do While .Found = True
.Execute Replace:=wdReplaceAll
Loop
End With
End With
Application.ScreenUpdating = True
End Sub
Solution 2:[2]
Assuming that the entire document is plain text, we can assign the entire document's text and use Split to convert it into array of words.
Since it's in array, it will be faster to process through them all vs accessing the Words collection.
This is all I can think of but perhaps there's a better way to do this? Below example uses Regex to search through and replace all matched duplicate:
Option Explicit
Sub Delete_Duplicate()
Const maxWord As Long = 2 'Change this to increase the max amount of words should be used to match as a phrase.
Dim fullTxt As String
fullTxt = ActiveDocument.Range.Text
Dim txtArr() As String
txtArr = Split(fullTxt, " ")
Dim regex As RegExp
Set regex = New RegExp
regex.Global = True
regex.IgnoreCase = True
Dim outputTxt As String
outputTxt = fullTxt
Dim n As Long
Dim i As Long
For i = UBound(txtArr) To 0 Step -1
Dim matchWord As String
matchWord = vbNullString
For n = 0 To maxWord - 1
If (i - n) < 0 Then Exit For
matchWord = txtArr(i - n) & " " & matchWord
matchWord = Trim$(Replace(matchWord, vbCr, vbNullString))
regex.Pattern = matchWord & "[, ]{0,}" & matchWord
If regex.test(outputTxt) Then
outputTxt = regex.Replace(outputTxt, matchWord)
End If
Next n
Next i
Set regex = Nothing
Application.UndoRecord.StartCustomRecord "Delete Duplicates"
ActiveDocument.Range.Text = outputTxt
Application.UndoRecord.EndCustomRecord
End Sub
Solution 3:[3]
It is only conception but try to prepare list of all words in document and replace double or triple words if existing.
Private Sub DeleteDuplicate()
Dim wholeTxt As String
Dim w As Range
Dim col As New Collection
Dim c
For Each w In ActiveDocument.Words
AddUniqueItem col, Trim(w.Text)
Next w
wholeTxt = ActiveDocument.Range.Text
For Each c In col
'add case with ","
'maybe one letter word should be forbidden, or add extra boundary
If InStr(1, wholeTxt, c & " " & c, vbBinaryCompare) <> 0 Then
'start of doc
Selection.HomeKey wdStory
'here should be all stuff to prepare replacement
'(...)
Selection.Find.Execute Findtext:=c & " " & c, ReplaceWith:=c
wholeTxt = ActiveDocument.Range.Text
End If
Next c
Set col = Nothing
End Sub
Private Sub AddUniqueItem(ByRef col As Collection, ByVal itemValAndKey As String)
Dim s As String
On Error Resume Next
s = col(itemValAndKey)
If Err.Number <> 0 Then
col.Add itemValAndKey, itemValAndKey
Err.Clear
End If
On Error GoTo 0
End Sub
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | macropod |
| Solution 2 | Raymond Wu |
| Solution 3 | deku |

