Notifications
Clear all

[Closed] Fast search through text file

yes. it’s similar to findItem but has some extra options. Anyway see the documentation for more details.

A Dictionary always stores <key, value> pairs. The key is usually something easily hashable (a string, an index, etc.) for quick lookup, but the value can be any object — for example, a pair as well.

Dictionary actually beats the regex version, and manual extraction of the first word improves performance even more:
~65ms

Perhaps there’s no reason to introduce a new class like Pair; you could simply use public Dictionary<string, List<string>> data; with a list of strings as the value type.

/// <summary>
/// Bundles a counter with a list of lines.
/// The constructor stores one line and sets the counter to 1; callers are
/// responsible for keeping <c>count</c> in step when they add more lines.
/// </summary>
public class Pair
{
	/// <summary>Number of recorded lines (1 right after construction).</summary>
	public int count;

	/// <summary>The recorded lines, in insertion order.</summary>
	public List<string> lines;

	/// <summary>Creates an entry that holds exactly one line.</summary>
	/// <param name="line">The first line to store.</param>
	public Pair( string line )
	{
		lines = new List<string>();
		lines.Add( line );
		count = 1;
	}
}

/// <summary>
/// Groups the lines of a text file by their first word (lower-cased).
/// </summary>
public class TextProcessor
{
	/// <summary>Lower-cased first word -> the lines that start with it. Rebuilt on every ProcessFile call.</summary>
	public Dictionary<string, Pair> data;

	/// <summary>Copy of the keys of <c>data</c>, filled at the end of ProcessFile.</summary>
	public string[] keys;

	/// <summary>
	/// Copy of the values of <c>data</c>, filled at the end of ProcessFile.
	/// Dictionary enumerates Keys and Values in the same order, so pairs[i] belongs to keys[i].
	/// </summary>
	public Pair[] pairs;

	/// <summary>
	/// Reads every line of <paramref name="file"/>, takes the first whitespace-delimited
	/// word of each line (lower-cased) and collects the lines under that word.
	/// Lines that are empty or contain only whitespace are collected under the empty-string key.
	/// Exceptions thrown by File.ReadAllLines (missing file, access denied, ...) are not caught here.
	/// </summary>
	/// <param name="file">Path of the text file to read.</param>
	/// <param name="words">Not used by this method; kept so existing callers keep working.</param>
	public void ProcessFile(string file , string[] words)
	{
		data = new Dictionary<string, Pair>();

		var spaces = new char[]{' ','\t','\r','\n'};

		// File.ReadAllLines either returns an array or throws, so no null check is needed.
		foreach (string line in File.ReadAllLines(file))
		{
			// First word = text from the first non-whitespace character up to the next
			// whitespace character (or the end of the line when there is none).
			string word = line.TrimStart(spaces);
			int len = word.IndexOfAny(spaces);
			if (len >= 0)
			{
				word = word.Substring(0, len);
			}
			word = word.ToLower();

			// One hash lookup instead of ContainsKey followed by two indexer reads.
			Pair entry;
			if (data.TryGetValue(word, out entry))
			{
				entry.count++;
				entry.lines.Add(line);
			}
			else
			{
				data[word] = new Pair(line);
			}
		}

		keys = new string[data.Count];
		data.Keys.CopyTo(keys, 0);
		pairs = new Pair[data.Count];
		data.Values.CopyTo(pairs, 0);
	}
}

Thank you.
Will test it.
Here is my attempt(does not work): https://dotnetfiddle.net/nfWKip

Is this the correct way to get the data?

format "str:%\n" tp.pairs[2].lines.item[1]

The line numbers are not collected, right?

I didn’t test it in max, but you can call .lines.ToArray() and get the mxs array with all the values
If you need line numbers, just add another list of integers to the Pair class — I guess it should then be called Line instead.

I will test this: https://dotnetfiddle.net/LA5U5B
with this fix:
2023-01-17%2016_10_54

line, lines all around, feel bad at naming things

        /// <summary>
        /// Lines collected for one key together with the index recorded for each line.
        /// lines[i] and indexes[i] describe the same entry; count is the number of entries.
        /// </summary>
        public class Line
		{
			/// <summary>Number of entries added so far.</summary>
			public int count;

			/// <summary>Text of each recorded line, in insertion order.</summary>
			public List<string> lines;

			/// <summary>Index passed in for each recorded line, parallel to <c>lines</c>.</summary>
			public List<int> indexes;

			/// <summary>Creates an empty entry: count 0, no lines, no indexes.</summary>
			public Line()
			{
				count   = 0;
				lines   = new List<string>();
				indexes = new List<int>();
			}

			/// <summary>Creates an entry that already holds one line and its index.</summary>
			/// <param name="line">Text of the first line.</param>
			/// <param name="index">Index to record for that line.</param>
			public Line( string line, int index ) : this()
			{
				// Go through AddLine so the three members cannot get out of step.
				// (new List<int>( index ) would only reserve capacity and store nothing.)
				AddLine( line, index );
			}

			/// <summary>Appends a line and its index and increments count.</summary>
			/// <param name="line">Text of the line.</param>
			/// <param name="index">Index to record for that line.</param>
			public void AddLine( string line, int index )
			{
				lines.Add( line );
				indexes.Add( index );
				count++;
			}
		}

		/// <summary>
		/// Groups the lines of a text file by their first word (lower-cased) and
		/// records the 1-based line number of every collected line.
		/// </summary>
		public class TextProcessor
		{
			/// <summary>Lower-cased first word -> collected lines and line numbers. Rebuilt on every ProcessFile call.</summary>
			public Dictionary<string, Line> data;

			/// <summary>Copy of the keys of <c>data</c>, filled at the end of ProcessFile.</summary>
			public string[] keys;

			/// <summary>
			/// Copy of the values of <c>data</c>, filled at the end of ProcessFile.
			/// Dictionary enumerates Keys and Values in the same order, so lines[i] belongs to keys[i].
			/// </summary>
			public Line[] lines;

			/// <summary>
			/// Reads every line of <paramref name="file"/>, takes the first whitespace-delimited
			/// word of each line (lower-cased) and collects the line and its 1-based line number
			/// under that word. Empty or whitespace-only lines end up under the empty-string key.
			/// Exceptions thrown by File.ReadAllLines are not caught here.
			/// </summary>
			/// <param name="file">Path of the text file to read.</param>
			/// <param name="words">Not used by this method; kept so existing callers keep working.</param>
			public void ProcessFile(string file , string[] words)
			{
				data = new Dictionary<string, Line>();

				var spaces = new char[]{' ','\t','\r','\n'};

				// Named fileLines so it does not shadow the public field "lines".
				// File.ReadAllLines either returns an array or throws, so no null check is needed.
				string[] fileLines = File.ReadAllLines(file);

				for (int i = 0; i < fileLines.Length; i++)
				{
					string line = fileLines[i];
					int line_index = i + 1; // line numbers are 1-based

					// Manual first-word extraction: only the start of the line is scanned,
					// the rest of the line is never tokenized.
					string word = line.TrimStart(spaces);
					int len = word.IndexOfAny(spaces);
					if (len >= 0)
					{
						word = word.Substring(0, len);
					}
					word = word.ToLower();

					// Single lookup; every insert goes through AddLine so the text,
					// the line number and the counter are always recorded together.
					Line entry;
					if (!data.TryGetValue(word, out entry))
					{
						entry = new Line();
						data[word] = entry;
					}
					entry.AddLine(line, line_index);
				}

				keys = new string[data.Count];
				data.Keys.CopyTo(keys, 0);
				lines = new Line[data.Count];
				data.Values.CopyTo(lines, 0);
			}
		}

Strange: there are fewer indexes than the other collected data. The first line index is not collected.

tp.keys[1]: beta

tp.lines[1].count: 1856
tp.lines[1].count: 1856

tp.lines[1].indexes.count: 1855

tp.lines[1].lines.count: 1856

I am using this inside your code to filter only the passed words:

// Only record the line when its first word is one of the requested words.
// Array.IndexOf searches the words array element by element, and it runs once per line.
if (Array.IndexOf(words, word) > -1)
						{	
							// Known key: append to the existing entry.
							if (data.ContainsKey(word))
							{
								data[word].AddLine( line, line_index );
							}
							// First occurrence of this word: create its entry.
							else
							{
								data[word] = new Line( line, line_index );						
							}
						}
2 Replies
(@serejah)
Joined: 10 months ago

Posts: 0

don’t do this.
instead initialize the dict with known words and add lines info only in case of data.ContainsKey(word)

...
foreach( var w in words ) data[w] = new Line(); // pre-initialize the dict with the known words
...
(@serejah)
Joined: 10 months ago

Posts: 0

I still don’t understand why that’s the case, but changing to this solves the issue
upd
Found it at last: I was putting index inside parentheses (which only sets the list’s initial capacity) instead of inside curly braces (which adds it as an element).

            /// <summary>Creates an entry that already holds one line and its index.</summary>
            public Line( string line, int index )
			{
				// The index must be added as an element; passing it to the
				// List<int> constructor would only set the initial capacity.
				lines = new List<string>();
				lines.Add( line );

				indexes = new List<int>();
				indexes.Add( index );

				count = 1;
			}

This way works. Thank you.

public void ProcessFile(string file , string[] words)
			{	
				data = new Dictionary<string, Line>();				
				foreach( var w in words ) data[w] = new Line();
Page 7 / 10