Public Class Form1 Const LOW_SCORE_CUTOFF As Integer = 20 Const SPACE_REPLACEMENT As Char = "^" Const QUOTE_REPLACEMENT As Char = "~" Const LF_REPLACEMENT As Char = "|" Private Class phraseData Public Key As String = "" Public Frequency As Integer = 0 Public PhraseCost As Integer = 0 Public ReadOnly Property Score As Integer Get Return (Frequency * (PhraseCost - 2)) - PhraseCost End Get End Property End Class Private Sub btnAbbreviations_Click(sender As Object, e As EventArgs) Handles btnAbbreviations.Click Dim bSearch34 As Boolean = False Dim bSearchCR As Boolean = False Dim WholeTextList As List(Of String) = New List(Of String) Dim dictFreq As New Dictionary(Of String, phraseData) Dim iNewWinnerThreshold As Integer = LOW_SCORE_CUTOFF txtResult.Text = "" Dim dtTotalStart As Date = Date.Now ' Get text from zap-files and store every line in a list of strings. ' The ".GSTR", ".STRL", "PRINTI" and "PRINTR" Op-codes contains the text. ' Every phrase is stored in a dictionary with the phrase as key. Dim dtStart As Date = Date.Now txtResult.Text = "Indexing files..." txtResult.Refresh() For Each fileZAP As String In IO.Directory.GetFiles(IO.Path.GetDirectoryName(Application.ExecutablePath) & "\Text") Dim lStartPos As Long = 0 If IO.Path.GetExtension(fileZAP).ToUpper = ".ZAP" Then Dim byteText() As Byte = IO.File.ReadAllBytes(fileZAP) For i As Long = 5 To byteText.Length - 1 Dim opCodeString As String = System.Text.Encoding.ASCII.GetString(byteText, i - 5, 5).ToUpper If opCodeString = ".GSTR" Then bSearch34 = True If opCodeString = ".STRL" Then bSearch34 = True If opCodeString = "RINTI" Then bSearch34 = True If opCodeString = "RINTR" Then bSearch34 = True If bSearch34 And byteText(i) = 34 Then lStartPos = i bSearch34 = False bSearchCR = True End If If bSearchCR And byteText(i) = 13 Then bSearchCR = False ' Replace ", [LF] & Space with printable and legal characters for a Key Dim byteTemp(i - lStartPos - 3) As Byte For j As Integer = 0 To byteTemp.Length - 1 Dim byteChar As Byte = byteText(lStartPos + 1 + j) If byteChar = 10 Then byteChar = ASCII(LF_REPLACEMENT) If byteChar = 32 Then byteChar = ASCII(SPACE_REPLACEMENT) If byteChar = 34 Then byteChar = ASCII(QUOTE_REPLACEMENT) byteTemp(j) = byteChar Next ' Create and update frequency for dictionary. Replace two double-quotes with one (the first is an escape-char). Dim sLine As String = System.Text.Encoding.ASCII.GetString(byteTemp).Replace(String.Concat(QUOTE_REPLACEMENT, QUOTE_REPLACEMENT), QUOTE_REPLACEMENT) WholeTextList.Add(sLine) For Each sKey In ExtractUniquePhrases(sLine) If Not dictFreq.ContainsKey(sKey) Then dictFreq(sKey) = New phraseData dictFreq(sKey).Frequency += SubStringOccurrences(sLine, sKey) dictFreq(sKey).Key = sKey Next End If Next End If Next txtResult.Text = String.Concat(txtResult.Text, dictFreq.Count.ToString(), " phrases...", Now().AddTicks(-dtStart.Ticks).ToString("s.fff \s"), vbCrLf) txtResult.Refresh() ' Calculate cost for each phrase in dictionary dtStart = Date.Now txtResult.Text = String.Concat(txtResult.Text, "Calculating costs...") txtResult.Refresh() Dim RemoveKeysList As New List(Of String) Dim oWinner = New phraseData Dim cCurrent As Char = "" Dim cPrevious As Char = "" For Each kvpPhrase As KeyValuePair(Of String, phraseData) In dictFreq kvpPhrase.Value.PhraseCost = 0 For i As Integer = 0 To kvpPhrase.Key.Length - 1 cPrevious = cCurrent cCurrent = CChar(kvpPhrase.Key.Substring(i, 1)) If (cCurrent >= "a" And cCurrent <= "z") OrElse cCurrent = SPACE_REPLACEMENT Then ' Alphabet A0 and space kvpPhrase.Value.PhraseCost += 1 ElseIf cCurrent >= "A" And cCurrent <= "Z" Then ' Alphabet A1 kvpPhrase.Value.PhraseCost += 2 ElseIf (cCurrent >= "0" And cCurrent <= "9") OrElse ".,!?_#'/\-:()".Contains(cCurrent) OrElse cCurrent = QUOTE_REPLACEMENT OrElse cCurrent = LF_REPLACEMENT Then ' Alphabet A2 kvpPhrase.Value.PhraseCost += 2 Else ' ZSCII escape, don't count next two characters kvpPhrase.Value.PhraseCost += 4 i += 2 End If Next ' Low score, mark phrase for removal If kvpPhrase.Value.Score < LOW_SCORE_CUTOFF Then RemoveKeysList.Add(kvpPhrase.Key) End If ' Current winner If kvpPhrase.Value.Score > oWinner.Score Then oWinner = kvpPhrase.Value ElseIf kvpPhrase.Value.Score = oWinner.Score Then ' Tie-breaker: Pick one with lowest frequency = longest. Least likely to affect others. If kvpPhrase.Value.Frequency < oWinner.Frequency Then oWinner = kvpPhrase.Value End If Next ' Calculate new winner threshold (8% of highest score). iNewWinnerThreshold = oWinner.Score * 0.08 ' Pick new winner! Dim oNewWinner As phraseData = oWinner For Each oTmp As KeyValuePair(Of String, phraseData) In dictFreq If oTmp.Key.Contains(oWinner.Key) AndAlso oTmp.Key.Length > oNewWinner.Key.Length AndAlso oTmp.Value.Score > iNewWinnerThreshold Then oNewWinner = oTmp.Value End If Next oWinner = oNewWinner txtResult.Text = String.Concat(txtResult.Text, Now().AddTicks(-dtStart.Ticks).ToString("s.fff \s"), vbCrLf) txtResult.Refresh() ' Removing low scoring phrases dtStart = Date.Now txtResult.Text = String.Concat(txtResult.Text, "Removing ", RemoveKeysList.Count.ToString, " low scoring phrases...") txtResult.Refresh() For Each removeKey As String In RemoveKeysList dictFreq.Remove(removeKey) Next txtResult.Text = String.Concat(txtResult.Text, Now().AddTicks(-dtStart.Ticks).ToString("s.fff \s"), vbCrLf) txtResult.Refresh() Dim iAbbreviationNo As Integer = 1 txtResult.Text = String.Concat(txtResult.Text, PrintAbbreviation(iAbbreviationNo, oWinner.Key, dictFreq(oWinner.Key).Frequency, dictFreq(oWinner.Key).Score)) txtResult.Refresh() Do ' Recalculate WholeTextList = modifyText(WholeTextList, oWinner.Key) RemoveKeysList = New List(Of String) oWinner = New phraseData iAbbreviationNo += 1 For Each kvpPhrase As KeyValuePair(Of String, phraseData) In dictFreq ' Recalculate frequency kvpPhrase.Value.Frequency = 0 For Each textString As String In WholeTextList If textString.Contains(kvpPhrase.Key) Then kvpPhrase.Value.Frequency += SubStringOccurrences(textString, kvpPhrase.Key) End If Next ' Update current highscore and remove low scoring phrases If kvpPhrase.Value.Score > oWinner.Score Then oWinner = kvpPhrase.Value ElseIf kvpPhrase.Value.Score = oWinner.Score Then ' Tie-breaker: Pick one with lowest frequency = longest. Least likely to affect others. If kvpPhrase.Value.Frequency < oWinner.Frequency Then oWinner = kvpPhrase.Value ElseIf kvpPhrase.Value.Score < LOW_SCORE_CUTOFF Then RemoveKeysList.Add(kvpPhrase.Key) End If Next ' Remove phrases For Each sRemoveKey As String In RemoveKeysList dictFreq.Remove(sRemoveKey) Next ' Pick new winner! oNewWinner = oWinner For Each oTmp As KeyValuePair(Of String, phraseData) In dictFreq If oTmp.Key.Contains(oWinner.Key) AndAlso oTmp.Key.Length > oNewWinner.Key.Length AndAlso oTmp.Value.Score > iNewWinnerThreshold Then oNewWinner = oTmp.Value End If Next oWinner = oNewWinner txtResult.Text = String.Concat(txtResult.Text, PrintAbbreviation(iAbbreviationNo, oWinner.Key, dictFreq(oWinner.Key).Frequency, dictFreq(oWinner.Key).Score)) txtResult.Refresh() Loop Until iAbbreviationNo = 96 txtResult.Text = String.Concat(txtResult.Text, "Totaltid: ", Now().AddTicks(-dtTotalStart.Ticks).ToString("m:ss.fff \s"), vbCrLf) End Sub Private Function modifyText(sourceList As List(Of String), keyPhrase As String) As List(Of String) Dim newList As New List(Of String) Dim splitString(0) As String splitString(0) = keyPhrase For Each line As String In sourceList If line.Contains(keyPhrase) Then Dim lineParts() As String = line.Split(splitString, StringSplitOptions.RemoveEmptyEntries) For i As Integer = 0 To lineParts.Count - 1 newList.Add(lineParts(i)) Next Else newList.Add(line) End If Next Return newList End Function Private Function ExtractUniquePhrases(sText As String) As List(Of String) Dim PhraseList As New List(Of String) Dim iMaxLen As Integer = 60 If sText.Length < iMaxLen Then iMaxLen = sText.Length For i As Integer = 2 To iMaxLen For j As Integer = 0 To sText.Length - i Dim sPhrase As String = sText.Substring(j, i) PhraseList.Add(sPhrase) Next Next Return PhraseList.Distinct.ToList End Function Private Function SubStringOccurrences(ByVal TextString As String, ByVal SubString As String) As Integer Try Return TextString.Split({SubString}, StringSplitOptions.None).Length - 1 Catch ex As Exception Return 0 End Try End Function Private Function PrintAbbreviation(iAbbreviationNo As Integer, sAbbreviation As String, iFrequency As Integer, iScore As Integer) Dim sPrintAbbreviation As String = String.Concat(Chr(34), sAbbreviation.Replace(SPACE_REPLACEMENT, " ").Replace(QUOTE_REPLACEMENT, String.Concat(Chr(34), Chr(34))).Replace(LF_REPLACEMENT, vbLf), Chr(34)) Return String.Concat(" .FSTR FSTR?", iAbbreviationNo.ToString, ",", sPrintAbbreviation.PadRight(20), "; ", iFrequency.ToString.PadLeft(4), "x, saved ", iScore, vbCrLf) End Function Private Function ASCII(cChr As Char) As Integer Return System.Text.Encoding.ASCII.GetBytes(cChr)(0) End Function End Class