all repos — gemini-redirect @ e10a8c515048915835c5dd29bd2614a277fb6045

blog/ribw/build-your-own-pc/index.html (view raw)

  1<!DOCTYPE html><html lang=en><head><meta charset=utf-8><meta name=description content="Official Lonami's website"><meta name=viewport content="width=device-width, initial-scale=1.0, user-scalable=yes"><title> Build your own PC | Lonami's Blog </title><link rel=stylesheet href=/style.css><body><article><nav class=sections><ul class=left><li><a href=/>lonami's site</a><li><a href=/blog class=selected>blog</a><li><a href=/golb>golb</a></ul><div class=right><a href=https://github.com/LonamiWebs><img src=/img/github.svg alt=github></a><a href=/blog/atom.xml><img src=/img/rss.svg alt=rss></a></div></nav><main><h1 class=title>Build your own PC</h1><div class=time><p>2020-02-25T02:00:12+00:00<p>last updated 2020-03-18T09:38:46+00:00</div><p><em>…where PC obviously stands for Personal Crawler</em>.<hr><p>This post contains the source code for a very simple crawler written in Java. You can compile and run it on any file or directory, and it will calculate the frequency of all the words it finds.<h2 id=source-code>Source code</h2><p>Paste the following code in a new file called <code>Crawl.java</code>:<pre><code>import java.io.*;
  2import java.util.*;
  3import java.util.regex.Matcher;
  4import java.util.regex.Pattern;
  5
  6class Crawl {
  7	// Regex used to tokenize the words from a line of text
  8	private final static Pattern WORDS = Pattern.compile("\\w+");
  9
 10	// The file where we will cache our results
 11	private final static File INDEX_FILE = new File("index.bin");
 12
 13	// Helper method to determine if a file is a text file or not
 14	private static boolean isTextFile(File file) {
 15		String name = file.getName().toLowerCase();
 16		return name.endsWith(".txt")
 17				|| name.endsWith(".java")
 18				|| name.endsWith(".c")
 19				|| name.endsWith(".cpp")
 20				|| name.endsWith(".h")
 21				|| name.endsWith(".hpp")
 22				|| name.endsWith(".html")
 23				|| name.endsWith(".css")
 24				|| name.endsWith(".js");
 25	}
 26
 27	// Normalizes a string by converting it to lowercase and removing accents
 28	private static String normalize(String string) {
 29		return string.toLowerCase()
 30				.replace("á", "a")
 31				.replace("é", "e")
 32				.replace("í", "i")
 33				.replace("ó", "o")
 34				.replace("ú", "u");
 35	}
 36
 37	// Recursively fills the map with the count of words found on all the text files
 38	static void fillWordMap(Map&LTString, Integer> map, File root) throws IOException {
 39		// Our file queue begins with the root
 40		Queue&LTFile> fileQueue = new ArrayDeque<>();
 41		fileQueue.add(root);
 42
 43		// For as long as the queue is not empty...
 44		File file;
 45		while ((file = fileQueue.poll()) != null) {
 46			if (!file.exists() || !file.canRead()) {
 47				// ...ignore files for which we don't have permission...
 48				System.err.println("warning: cannot read file: " + file);
 49			} else if (file.isDirectory()) {
 50				// ...else if it's a directory, extend our queue with its files...
 51				File[] files = file.listFiles();
 52				if (files == null) {
 53					System.err.println("warning: cannot list dir: " + file);
 54				} else {
 55					fileQueue.addAll(Arrays.asList(files));
 56				}
 57			} else if (isTextFile(file)) {
 58				// ...otherwise, count the words in the file.
 59				countWordsInFile(map, file);
 60			}
 61		}
 62	}
 63
 64	// Counts the words in a single file and adds the count to the map.
 65	public static void countWordsInFile(Map&LTString, Integer> map, File file) throws IOException {
 66		BufferedReader reader = new BufferedReader(new FileReader(file));
 67
 68		String line;
 69		while ((line = reader.readLine()) != null) {
 70			Matcher matcher = WORDS.matcher(line);
 71			while (matcher.find()) {
 72				String token = normalize(matcher.group());
 73				Integer count = map.get(token);
 74				if (count == null) {
 75					map.put(token, 1);
 76				} else {
 77					map.put(token, count + 1);
 78				}
 79			}
 80		}
 81
 82		reader.close();
 83	}
 84
 85	// Prints the map of word count to the desired output stream.
 86	public static void printWordMap(Map&LTString, Integer> map, PrintStream writer) {
 87		List&LTString> keys = new ArrayList<>(map.keySet());
 88		Collections.sort(keys);
 89		for (String key : keys) {
 90			writer.println(key + "\t" + map.get(key));
 91		}
 92	}
 93
 94	@SuppressWarnings("unchecked")
 95	public static void main(String[] args) throws IOException, ClassNotFoundException {
 96		// Validate arguments
 97		if (args.length == 1 && args[0].equals("--help")) {
 98			System.err.println("usage: java Crawl [input]");
 99			return;
100		}
101
102		File root = new File(args.length > 0 ? args[0] : ".");
103
104		// Loading or generating the map where we aggregate the data  {word: count}
105		Map&LTString, Integer> map;
106		if (INDEX_FILE.isFile()) {
107			System.err.println("Found existing index file: " + INDEX_FILE);
108			try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(INDEX_FILE))) {
109				map = (Map&LTString, Integer>) ois.readObject();
110			}
111		} else {
112			System.err.println("Index file not found: " + INDEX_FILE + "; indexing...");
113			map = new TreeMap<>();
114			fillWordMap(map, root);
115			// Cache the results to avoid doing the work a next time
116			try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(INDEX_FILE))) {
117				out.writeObject(map);
118			}
119		}
120
121		// Ask the user in a loop to query for words
122		Scanner scanner = new Scanner(System.in);
123		while (true) {
124			System.out.print("Escriba palabra a consultar (o Enter para salir): ");
125			System.out.flush();
126			String line = scanner.nextLine().trim();
127			if (line.isEmpty()) {
128				break;
129			}
130
131			line = normalize(line);
132			Integer count = map.get(line);
133			if (count == null) {
134				System.out.println(String.format("La palabra \"%s\" no está presente", line));
135			} else if (count == 1) {
136				System.out.println(String.format("La palabra \"%s\" está presente 1 vez", line));
137			} else {
138				System.out.println(String.format("La palabra \"%s\" está presente %d veces", line, count));
139			}
140		}
141	}
142}
143</code></pre><p>It can be compiled and executed as follows:<pre><code>javac Crawl.java
144java Crawl
145</code></pre><p>Instead of copy-pasting the code, you may also download it as a <code>.zip</code>:<p><em>(contents removed)</em><h2 id=addendum>Addendum</h2><p>The following simple function can be used if one desires to print the contents of a file:<pre><code>public static void printFile(File file) {
146	if (isTextFile(file)) {
147		System.out.println('\n' + file.getName());
148		try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
149			String line;
150			while ((line = reader.readLine()) != null) {
151				System.out.println(line);
152			}
153		} catch (FileNotFoundException ignored) {
154			System.err.println("warning: file disappeared while reading: " + file);
155		} catch (IOException e) {
156			e.printStackTrace();
157		}
158	}
159}
160</code></pre></main><footer><div><p>Share your thoughts, or simply come hang with me <a href=https://t.me/LonamiWebs><img src=/img/telegram.svg alt=Telegram></a> <a href=mailto:totufals@hotmail.com><img src=/img/mail.svg alt=Mail></a></div></footer></article><p class=abyss>Glaze into the abyss… Oh hi there!