[Closed] Fast search through text file
Hi, guys and Happy New Year!
I am trying to go as fast as possible with this task – find words in a string (text file).
This code snippet will generate a string(imitating text file) with 50 000 lines(the number is correct).
(
	-- Benchmark: build a 50 000-line test string, split it into lines and
	-- collect data for every line whose FIRST word is one of the key words.
	--   kappa   -> index of the line in fStrArr
	--   omicron -> text of the line (leading tabs/spaces removed)
	--   upsilon -> text of the line (leading tabs/spaces removed)
	gc()
	newLineStr = "\n"
	newTabStr = "\t "	-- token separators, also used as the left-trim set
	--( generate string
	space = " "
	nl = "\n"
	tab = "\t"
	tabNl = "\t\n"
	fmt = "%\n"
	wordsArr = #("alpha", "beta", "gama", "delta", "Epsilon", "Zeta", "Eta", "Theta", "Iota", "kaPPa", "LamBda", "mU", "Nu", "xi", "omicron", "pi", "rHo", "siGma", "Tau", "UpSiLoN", "pHi", "chi", "psi", "omega")
	ss = stringStream ""
	for i = 1 to 50000 do
	(
		wordsCnt = random 10 20
		wArr = for j = 1 to wordsCnt collect (wordsArr[random 1 24])
		tabCnt = random 0 5
		str = ""
		if mod i 17 == 0 then
			str = tabNl
		else
		(
			if mod i 33 == 0 then
				str = nl
			else
			(
				for t = 1 to tabCnt do str += tab
				for w in wArr do str += space + w
			)
		)
		format fmt str to:ss
	)
	--)
	-- lower-case once so the comparisons below can be plain string equality
	strToCheck = toLower (ss as string)

	gc()
	t0 = timestamp()
	fStrArr = (filterString strToCheck newLineStr)
	t1 = timestamp()
	format "filterString % sec.\n" ((t1-t0)/1000.0)
	format "Lines: %\n" fStrArr.count

	_kappa = "kappa"
	_omicron = "omicron"
	_upsilon = "upsilon"

	kappaArr = #()
	omicronArr = #()
	upsilonArr = #()

	gc()
	t0 = timestamp()
	-- lines that contain only tabs/spaces yield no tokens and are skipped by the where clause
	for j = 1 to fStrArr.count where (stringArr = (filterString fStrArr[j] newTabStr)).count != 0 do
	(
		-- dispatch directly on the first word; the labels must be the key
		-- words themselves (not the result arrays), otherwise nothing matches
		case stringArr[1] of
		(
			-- collect the line number
			_kappa:
			(
				append kappaArr j
			)
			-- collect the line text
			_omicron:
			(
				append omicronArr (trimLeft fStrArr[j] newTabStr)
			)
			-- collect the line text
			_upsilon:
			(
				append upsilonArr (trimLeft fStrArr[j] newTabStr)
			)
		)
	)
	t1 = timestamp()
	format "Find words % sec.\n" ((t1-t0)/1000.0)
	format "kappaArr: %\n" kappaArr.count
	format "omicronArr: %\n" omicronArr.count
	format "upsilonArr: %\n" upsilonArr.count
)
You can see what the script has to do – find each line of the strToCheck which starts with one of predefined words and collect some data.
With the code above I have:
Find words 0.285 sec.
if I remove the case str1 of statement the time is almost the same.
With the real text file I am using the time to find all predefined words is about: 0.7 sec.
If I use only python(outside 3ds max) the time to do the same is about 0.18 sec.
Is there any way to make MAXScript perform the task faster?
The collected data will be used to fill a dotnet ListView.
fastest way would be to use regex in multiline mode so you don’t have to split the text into separate lines.
if you have to iterate over each line anyway then use default singleline mode to check if the line matches the pattern with Match method. See examples section
there were some examples posted on forum
if you need to combine several regex modes, you can do it with dotnet.combineEnums
in your case the pattern for multiline mode would look like this:
-- multiline-mode pattern: a line that starts with one of the three key words
regex_pattern = "^(kappa|omicron|upsilon)\b"
where ^ is the char that requires the match to be at the beginning of the line, | is simply the OR operator, and \b is the word boundary. So any line starting with one of these words will return a match.
I’d love to post an mxs example, but my laptop’s screen suddenly broke and this device has no max installed
Thank you, Serejah!
I have spent several hours searching the net for a way to get the line number when regEx is used(and the whole line’s text), but I had no success. In some cases I need to know on which line the words are written, but I could not find how to do this using regEx.
in this case you are probably better off iterating over the lines array, so the line number is always known
although SO has some answers for similar task
upd
how long would this code take compared to the above?
-- Walk the pre-split lines (fStrArr) and sort them by their leading word.
-- Expects kappaArr / omicronArr / upsilonArr to exist as arrays already.
line_num = 1
for line in fStrArr do
(
	-- strip leading tabs AND spaces before any test: the generated lines are
	-- indented, so testing the raw line would never find a leading word
	trimmed = trimLeft line " \t"
	case of
	(
		-- kappa: remember the position of the line
		(matchPattern trimmed pattern:"kappa*") : append kappaArr line_num
		-- omicron / upsilon: remember the trimmed text of the line
		(matchPattern trimmed pattern:"omicron*") : append omicronArr trimmed
		(matchPattern trimmed pattern:"upsilon*") : append upsilonArr trimmed
	)
	line_num += 1
)
Your code: 0.142
Mine: 0.280
Update:
Using your approach:
speed increases from 0.8 to 0.65 sec
Searching single word in 50000 lines the time went down from 2.62 sec to 2.46 sec(for 4407 occurrences).
For comparison in python the same search takes 0.088 sec for 4407 occurrences.
Still not as fast as I need.
look here, I’ve ported the code to do a quick test
https://dotnetfiddle.net/VLE9LB
pure c# version should do the job
Multiline:
5652
Time:293 ms. – could be improved with RegexOptions.Compiled option. ~200ms best time
Line by line:
5652
Time:65 ms.
After looking at the input string I realized that the pattern must be different and include "^\\s*..." at the beginning, to match lines that start with whitespace.
The C# code does not collect any data.
Does it know which word is found on the processed line?
RegEx in maxscript, in the way I use it, is not a solution
(
	-- Benchmark: same test data as before (fixed seed), but each line is
	-- tested with the static .NET Regex.Match method. For every line whose
	-- first word is kappa / omicron / upsilon the index of the line in
	-- fStrArr is appended to the array of the same name.
	gc()
	newLineStr = "\n"
	--( generate string
	space = " "
	nl = "\n"
	tab = "\t"
	tabNl = "\t\n"
	fmt = "%\n"
	wordsArr = #("alpha", "beta", "gama", "delta", "Epsilon", "Zeta", "Eta", "Theta", "Iota", "kaPPa", "LamBda", "mU", "Nu", "xi", "omicron", "pi", "rHo", "siGma", "Tau", "UpSiLoN", "pHi", "chi", "psi", "omega")
	seed 123	-- fixed seed: identical text on every run
	ss = stringStream ""
	for i = 1 to 50000 do
	(
		wordsCnt = random 10 20
		wArr = for j = 1 to wordsCnt collect (wordsArr[random 1 24])
		tabCnt = random 0 5
		str = ""
		if mod i 17 == 0 then
			str = tabNl
		else
		(
			if mod i 33 == 0 then
				str = nl
			else
			(
				for t = 1 to tabCnt do str += tab
				for w in wArr do str += space + w
			)
		)
		format fmt str to:ss
	)
	--)
	strToCheck = toLower (ss as string)

	gc()
	t0 = timestamp()
	fStrArr = (filterString strToCheck newLineStr)
	t1 = timestamp()
	format "filterString % sec.\n" ((t1-t0)/1000.0)
	format "Lines: %\n" fStrArr.count

	kappaArr = #()
	omicronArr = #()
	upsilonArr = #()

	-- static Regex.Match(input, pattern)
	RE_Match = (dotnetclass "system.text.regularexpressions.regex").match
	-- optional leading whitespace, then the key word as a whole word
	RE_Pattern_Omicron = "^\\s*(omicron)\\b"
	RE_Pattern_Kappa = "^\\s*(kappa)\\b"
	RE_Pattern_Upsilon = "^\\s*(upsilon)\\b"

	gc()
	t0 = timestamp()
	j = 1
	for str1 in fStrArr do
	(
		-- each match goes to the array named after the matched word
		if (RE_Match str1 RE_Pattern_Omicron).Success then
		(
			append omicronArr j
		)
		else
		(
			if (RE_Match str1 RE_Pattern_Kappa).Success then
			(
				append kappaArr j
			)
			else
			(
				if (RE_Match str1 RE_Pattern_Upsilon).Success do
				(
					append upsilonArr j
				)
			)
		)
		j += 1
	)
	t1 = timestamp()
	format "Find words % sec.\n" ((t1-t0)/1000.0)
	format "kappaArr: %\n" kappaArr.count
	format "omicronArr: %\n" omicronArr.count
	format "upsilonArr: %\n" upsilonArr.count
)
0.9 sec vs 0.14 sec for matchPattern.
Hard to help you without 3dsmax running, but here’s a bit optimized c# version, 40-50ms
Try porting it to mxs, it shouldn’t be complicated
https://dotnetfiddle.net/zrQ5r3
add regex options as in c# source to your mxs code, it should improve the performance
combine them with dotnet.combineenums and you’re good to go
If you want performance try moving loop iteration to c#
Make a dynamically compiled c# dll with a class and a method that would take a string or filepath as an input parameter and fill the arrays with the data
after process is complete you can access arrays to get the values to maxscript
smth like that
using System;
using System.Collections.Generic;

/// <summary>
/// Scans a text document line by line and collects data about every line
/// whose first word is "kappa", "omicron" or "upsilon" (case-insensitive).
/// </summary>
public class DocProcessor
{
    // characters stripped from the beginning of each line before testing it
    static readonly char[] LeadingWhitespace = { ' ', '\t' };

    List<int> _kappa = new List<int>();
    List<string> _omicron = new List<string>();
    List<string> _upsilon = new List<string>();

    /// <summary>Zero-based indices of the lines that start with "kappa".</summary>
    public int[] kappa {
        get
        {
            return _kappa.ToArray();
        }
    }

    /// <summary>Left-trimmed text of the lines that start with "omicron".</summary>
    public string[] omicron {
        get
        {
            return _omicron.ToArray();
        }
    }

    /// <summary>Left-trimmed text of the lines that start with "upsilon".</summary>
    public string[] upsilon {
        get
        {
            return _upsilon.ToArray();
        }
    }

    /// <summary>
    /// Clears the previous results and processes <paramref name="doc"/>.
    /// Lines are separated by '\n'; a trailing '\r' is ignored.
    /// A null or empty document leaves all results empty.
    /// </summary>
    public void ProcessDocument( string doc )
    {
        // clear all the lists before the start
        _kappa.Clear();
        _omicron.Clear();
        _upsilon.Clear();

        if ( string.IsNullOrEmpty( doc ) ) return;

        int line_index = 0;
        foreach ( string line in doc.Split( '\n' ) )
        {
            string new_line = line.TrimEnd( '\r' ).TrimStart( LeadingWhitespace );

            if ( StartsWithWord( new_line, "kappa" ) ) _kappa.Add( line_index );
            else if ( StartsWithWord( new_line, "omicron" ) ) _omicron.Add( new_line );
            else if ( StartsWithWord( new_line, "upsilon" ) ) _upsilon.Add( new_line );

            line_index++;
        }
    }

    // True when text begins with word and the word ends there
    // (end of text, or a character that is not a letter, digit or '_').
    static bool StartsWithWord( string text, string word )
    {
        if ( !text.StartsWith( word, StringComparison.OrdinalIgnoreCase ) ) return false;
        if ( text.Length == word.Length ) return true;
        char next = text[word.Length];
        return !( char.IsLetterOrDigit( next ) || next == '_' );
    }
}
then in mxs
(
	-- create the compiled helper and let it scan the whole text in one call
	processor = dotnetobject "DocProcessor"
	processor.ProcessDocument doc_string
	-- read the kappa results back into MAXScript and show how many there are
	kappaResult = processor.kappa
	print kappaResult.count
)
not tested
Thank you.
With my zero C# knowledge, this does not compile at all:
(
	-- Compiles the C# class DocProcessor in memory and returns an instance of it,
	-- or undefined when the compilation fails (the compiler errors are printed).
	--
	-- DocProcessor.ProcessDocument <string> splits the text on "\n" and, for every
	-- line whose first word is kappa / omicron / upsilon (case-insensitive), stores
	-- the original line text and its 1-based line number. The results are read
	-- through the kappa / omicron / upsilon (string[]) and kappaIdx / omicronIdx /
	-- upsilonIdx (int[]) properties.
	fn CreateArrAssembly =
	(
		source = ""
		source += "using System;\n"
		source += "using System.Collections.Generic;\n"
		source += "using System.Text.RegularExpressions;\n"
		source += "public class DocProcessor\n"
		source += "{\n"
		source += "    List<string> _kappa = new List<string>();\n"
		source += "    List<string> _omicron = new List<string>();\n"
		source += "    List<string> _upsilon = new List<string>();\n"
		source += "    List<int> _kappaIdx = new List<int>();\n"
		source += "    List<int> _omicronIdx = new List<int>();\n"
		source += "    List<int> _upsilonIdx = new List<int>();\n"
		source += "    public string[] kappa { get { return _kappa.ToArray(); } }\n"
		source += "    public string[] omicron { get { return _omicron.ToArray(); } }\n"
		source += "    public string[] upsilon { get { return _upsilon.ToArray(); } }\n"
		source += "    public int[] kappaIdx { get { return _kappaIdx.ToArray(); } }\n"
		source += "    public int[] omicronIdx { get { return _omicronIdx.ToArray(); } }\n"
		source += "    public int[] upsilonIdx { get { return _upsilonIdx.ToArray(); } }\n"
		-- fields need an explicit type ('var' is only valid for locals); the patterns are
		-- C# verbatim strings so that \b reaches the regex engine as a word boundary
		source += "    Regex reOmicron = new Regex( @\"^omicron\\b\", RegexOptions.CultureInvariant | RegexOptions.IgnoreCase | RegexOptions.Compiled );\n"
		source += "    Regex reKappa = new Regex( @\"^kappa\\b\", RegexOptions.CultureInvariant | RegexOptions.IgnoreCase | RegexOptions.Compiled );\n"
		source += "    Regex reUpsilon = new Regex( @\"^upsilon\\b\", RegexOptions.CultureInvariant | RegexOptions.IgnoreCase | RegexOptions.Compiled );\n"
		-- leading characters to strip before matching: space and tab
		source += "    char[] spaces = new char[]{ ' ', '\\t' };\n"
		source += "    public void ProcessDocument( string doc )\n"
		source += "    {\n"
		-- start from empty lists so the object can be reused for another document
		source += "        _kappa.Clear(); _omicron.Clear(); _upsilon.Clear();\n"
		source += "        _kappaIdx.Clear(); _omicronIdx.Clear(); _upsilonIdx.Clear();\n"
		source += "        if ( string.IsNullOrEmpty( doc ) ) return;\n"
		source += "        int line_index = 0;\n"
		source += "        foreach ( string rawLine in doc.Split( '\\n' ) )\n"
		source += "        {\n"
		source += "            line_index++;\n"
		source += "            string line = rawLine.TrimEnd( '\\r' );\n"
		source += "            string lline = line.TrimStart( spaces );\n"
		source += "            if ( reOmicron.IsMatch( lline ) )\n"
		source += "            {\n"
		source += "                _omicron.Add( line );\n"
		source += "                _omicronIdx.Add( line_index );\n"
		source += "            }\n"
		source += "            else if ( reKappa.IsMatch( lline ) )\n"
		source += "            {\n"
		source += "                _kappa.Add( line );\n"
		source += "                _kappaIdx.Add( line_index );\n"
		source += "            }\n"
		source += "            else if ( reUpsilon.IsMatch( lline ) )\n"
		source += "            {\n"
		source += "                _upsilon.Add( line );\n"
		source += "                _upsilonIdx.Add( line_index );\n"
		source += "            }\n"
		source += "        }\n"
		source += "    }\n"
		source += "}\n"

		csharpProvider = dotnetobject "Microsoft.CSharp.CSharpCodeProvider"
		compilerParams = dotnetobject "System.CodeDom.Compiler.CompilerParameters"
		-- Regex lives in System.dll
		compilerParams.ReferencedAssemblies.AddRange #("System.dll")
		compilerParams.GenerateInMemory = on
		compilerResults = csharpProvider.CompileAssemblyFromSource compilerParams #(source)

		if compilerResults.Errors.HasErrors then
		(
			-- report what the C# compiler complained about instead of failing later
			for i = 0 to (compilerResults.Errors.Count - 1) do
			(
				err = compilerResults.Errors.Item[i]
				format "C# % Line:% Column:% %\n" err.ErrorNumber err.Line err.Column err.ErrorText
			)
			undefined
		)
		else
		(
			assembly = compilerResults.CompiledAssembly
			assembly.CreateInstance "DocProcessor"
		)
	)
	global DocProcessorA = CreateArrAssembly()
)
Playing with your code ( https://dotnetfiddle.net/vshfOl )