From d116554cdb0f31e91fa06eac6e38cf38e6597a97 Mon Sep 17 00:00:00 2001 From: marsha <46257533+m-rsha@users.noreply.github.com> Date: Fri, 30 Dec 2022 20:37:32 -0600 Subject: [PATCH 1/2] Initial Rust junk --- .gitignore | 5 ++++ Cargo.lock | 7 +++++ Cargo.toml | 8 ++++++ build.rs | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 23 ++++++++++++++++ 5 files changed, 121 insertions(+) create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 build.rs create mode 100644 src/main.rs diff --git a/.gitignore b/.gitignore index a15a8d1..8eca0f5 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,8 @@ /.coverage /.tox /dist + + +# Added by cargo + +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..82a25e4 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "identify" +version = "0.0.1" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..10b544f --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "identify" +version = "0.0.1" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..13813fd --- /dev/null +++ b/build.rs @@ -0,0 +1,78 @@ +use std::collections::HashMap; +use std::collections::HashSet; +use std::env; +use std::fs; +use std::path::Path; + +type Dict = HashMap>; + +fn serialize_map(map: Dict, filename: &Path) { + let mut lines: Vec = ["[".into()].into(); + for (ext, tags) in map.iter() { + lines.push(format!(r#" ("{ext}", ["#)); + for tag in tags { + lines.push(format!(r#""{tag}", "#)); + } + lines.push("].into()),".into()); + } + lines.push("].into()".into()); + fs::write(filename, lines.join("\n")).unwrap(); +} + +fn main() { + // We want to create a series of hashmaps from + // identify/{extensions,interpreters}.py + // and place them in `out_dir/{extensions,interpreters}.rs` + // (or name each file after the dict, I suppose) + + let mut extensions: Dict = HashMap::new(); + let mut extensions_need_binary_check: Dict = HashMap::new(); + let mut names: Dict = HashMap::new(); + let mut interpreters: Dict = HashMap::new(); + let mut current_dict = String::new(); + + // take a python file + let mut python = fs::read_to_string("identify/extensions.py").unwrap(); + python.push_str(&fs::read_to_string("identify/interpreters.py").unwrap()); + + // read the dicts into hashmaps + for line in python.lines() { + if let Some((dict_name, _)) = line.split_once('=') { + current_dict = dict_name.trim().into(); + } + else if let Some((ext, tags)) = line.split_once(':') { + let ext = ext.trim().replace('\'', "").to_string(); + let tags: HashSet = tags.trim() + .split(',') + .map(|tag| + tag.trim().replace(|c| "'{}".contains(c), "") + ) + .filter(|tag| !tag.is_empty()) + .collect(); + + match current_dict.as_str() { + "EXTENSIONS" => extensions.insert(ext, tags), + "EXTENSIONS_NEED_BINARY_CHECK" => { + extensions_need_binary_check.insert(ext, tags) + }, + "NAMES" => names.insert(ext, tags), + "INTERPRETERS" => interpreters.insert(ext, tags), + _ => panic!("Unexpected dict name: {current_dict}"), + }; + } + } + + // write them into a rust file + let out_dir = env::var_os("OUT_DIR").unwrap(); + + let extensions_rs = Path::new(&out_dir).join("extensions.rs"); + let enbc_rs = Path::new(&out_dir).join("extensions_need_binary_check.rs"); + let names_rs = Path::new(&out_dir).join("names.rs"); + let interpreters_rs = Path::new(&out_dir).join("interpreters.rs"); + serialize_map(extensions, &extensions_rs); + serialize_map(extensions_need_binary_check, &enbc_rs); + serialize_map(names, &names_rs); + serialize_map(interpreters, &interpreters_rs); + + println!("cargo:rerun-if-changed=build.rs"); +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..5b217c3 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,23 @@ +use std::collections::HashMap; +use std::collections::HashSet; + + +fn main() { + let extensions: HashMap<&str, HashSet<&str>> = include!( + concat!(env!("OUT_DIR"), "/extensions.rs") + ); + let extensions_need_binary_check: HashMap<&str, HashSet<&str>> = include!( + concat!(env!("OUT_DIR"), "/extensions_need_binary_check.rs") + ); + let names: HashMap<&str, HashSet<&str>> = include!( + concat!(env!("OUT_DIR"), "/names.rs") + ); + let interpreters: HashMap<&str, HashSet<&str>> = include!( + concat!(env!("OUT_DIR"), "/interpreters.rs") + ); + + println!("{:?}", extensions["bash"]); + println!("{:?}", extensions_need_binary_check["ppm"]); + println!("{:?}", names[".flake8"]); + println!("{:?}", interpreters["python3"]); +} From aed0b7e7041a05a31d17601e458000f365307d7f Mon Sep 17 00:00:00 2001 From: marsha <46257533+m-rsha@users.noreply.github.com> Date: Mon, 6 Feb 2023 17:17:40 -0600 Subject: [PATCH 2/2] blorp --- Cargo.lock | 101 +++++++++++++++++++++++++++++++ Cargo.toml | 1 + build.rs | 10 ++-- src/identify.rs | 155 ++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 31 ++++------ src/tags.rs | 15 +++++ 6 files changed, 290 insertions(+), 23 deletions(-) create mode 100644 src/identify.rs create mode 100644 src/tags.rs diff --git a/Cargo.lock b/Cargo.lock index 82a25e4..258e507 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5,3 +5,104 @@ version = 3 [[package]] name = "identify" version = "0.0.1" +dependencies = [ + "phf", +] + +[[package]] +name = "phf" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c" +dependencies = [ + "phf_macros", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1181c94580fa345f50f19d738aaa39c0ed30a600d95cb2d3e23f94266f14fbf" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_macros" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92aacdc5f16768709a569e913f7451034034178b05bdc8acda226659a3dccc66" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "phf_shared" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676" +dependencies = [ + "siphasher", +] + +[[package]] +name = "proc-macro2" +version = "1.0.49" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57a8eca9f9c4ffde41714334dee777596264c7825420f521abc92b5b5deb63a5" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" + +[[package]] +name = "siphasher" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" + +[[package]] +name = "syn" +version = "1.0.107" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" diff --git a/Cargo.toml b/Cargo.toml index 10b544f..db5e0e8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,3 +6,4 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +phf = { "version" = "0.11.1", "features" = ["macros"] } diff --git a/build.rs b/build.rs index 13813fd..0a38870 100644 --- a/build.rs +++ b/build.rs @@ -7,16 +7,16 @@ use std::path::Path; type Dict = HashMap>; fn serialize_map(map: Dict, filename: &Path) { - let mut lines: Vec = ["[".into()].into(); + let mut lines: Vec = ["phf_map!(\n".into()].into(); for (ext, tags) in map.iter() { - lines.push(format!(r#" ("{ext}", ["#)); + lines.push(format!(r#" "{ext}" => phf_set!("#)); for tag in tags { lines.push(format!(r#""{tag}", "#)); } - lines.push("].into()),".into()); + lines.push("),\n".into()); } - lines.push("].into()".into()); - fs::write(filename, lines.join("\n")).unwrap(); + lines.push(")".into()); + fs::write(filename, lines.join("")).unwrap(); } fn main() { diff --git a/src/identify.rs b/src/identify.rs new file mode 100644 index 0000000..cd6fc6d --- /dev/null +++ b/src/identify.rs @@ -0,0 +1,155 @@ +#![allow(dead_code)] +#![allow(unused_imports)] +#![allow(unused_variables)] +#![allow(unused_mut)] + +use std::collections::HashMap; +use std::collections::HashSet; +use std::ffi::OsStr; +use std::ffi::OsString; +use std::fs; +use std::os::unix::fs::FileTypeExt; // For `filetype.is_socket()` apparently +// use std::os::unix::fs::PermissionsExt; // fs::Permissions `mode()` +use std::os::unix::fs::MetadataExt; // fs::Permissions `mode()` +use std::path::Path; + +use crate::tags; + +#[derive(Debug, Eq, Hash, PartialEq)] +pub enum Tags { + Directory, + Symlink, + Socket, + File, + Executable, + NonExecutable, + Text, + Binary, +} + +pub fn tags_from_path(file_path: &str) -> HashSet { + let file = Path::new(file_path); + // TODO: Convert to Error + if !file.exists() { + panic!("{file_path} does not exist."); + } + + let metadata = fs::symlink_metadata(&file); + + if let Ok(metadata) = metadata { + // let perms = metadata.mode() & 0o777; + // println!("{:o}", perms); + if metadata.is_symlink() { + return HashSet::from([Tags::Symlink]); + } + if metadata.is_dir() { + return HashSet::from([Tags::Directory]); + } + if metadata.file_type().is_socket() { + return HashSet::from([Tags::Socket]); + } + } + + let tags = HashSet::from([Tags::File]); + // TODO + // If executable, add to `tags` + + let t = tags_from_filename(file_path); + // see if we can get tags_from_filename() and if not, + // then... weird parse_shebang stuff? + + // a lil more. reread it when not tired. + tags +} + +pub fn tags_from_filename(filename: &str) -> HashSet { + let path = Path::new(filename); + let filename = path.file_name().unwrap().to_str().unwrap().to_string(); + let ext = path.extension().unwrap().to_str().unwrap().to_lowercase(); + + let mut ret = HashSet::new(); + /* + let _: Vec<&str> = filename.split('.').collect(); + let mut parts = Vec::from([filename.clone()]); + parts.extend(filename.split('.').map(|s| s.to_string())); + + for part in parts { + if tags::NAMES.contains_key(&part) { + println!("{:?}", tags::NAMES[&part]); + // ret.push(tags::NAMES[&part]); + } + println!("Boop: {}", part); + } + */ + + if tags::EXTENSIONS.contains_key(&ext) { + ret.extend(tags::EXTENSIONS[&ext].iter().map(|s| s.to_string())); + } else if tags::EXTENSIONS_NEED_BINARY_CHECK.contains_key(&ext) { + ret.extend( + tags::EXTENSIONS_NEED_BINARY_CHECK[&ext] + .iter() + .map(|s| s.to_string()), + ); + } + /* + for part in Vec::from([ + filename.clone(), + filename.split('.').map(|s| s.to_string()).collect() + ]) { + println!("Boop: {}", part); + } + */ + + // identify.py creates a set, then, + // if filename + filename.split('.') items in extensions.NAMES, + // add to set and break + /* + let mut map = HashSet::new(); + if filename in extensions::names() { + map.insert(extension); + } + */ + + // if there's an extension, + // lowercase it, + // then if it's in extension.EXTENSIONS, add to set + // or if it's in extension.EXTENSIONS_NEED_BINARY_CHECK, add to set + // return set + + /* + let mut tags: HashSet = HashSet::new(); + if let Some(name) = path.file_name().and_then(OsStr::to_str) { + tags.insert(name.to_owned()); + } + if let Some(ext) = path.extension().and_then(OsStr::to_str) { + tags.insert(ext.to_owned()); + } + */ + // Get filename and extension + // Allow "Dockerfile.xenial" to also match "Dockerfile" + // If filename in extensions.NAMES, add + // If extension in EXTENSIONS, add + // tags + ret +} + +pub fn tags_from_interpreter(interpreter: &str) -> HashSet { + HashSet::new() +} + +pub fn is_text(/* bytes io */) -> bool { + false +} + +pub fn file_is_text(path: &str) -> bool { + false +} + +/* +pub fn parse_shebang( /* bytesio */) -> tuple of unknown size? { +} + + +pub fn parse_shebang_from_file(path: PathBuf) -> tuple of unknown size? { +} +*/ diff --git a/src/main.rs b/src/main.rs index 5b217c3..d13da55 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,23 +1,18 @@ -use std::collections::HashMap; -use std::collections::HashSet; +use std::env; +mod identify; +mod tags; fn main() { - let extensions: HashMap<&str, HashSet<&str>> = include!( - concat!(env!("OUT_DIR"), "/extensions.rs") - ); - let extensions_need_binary_check: HashMap<&str, HashSet<&str>> = include!( - concat!(env!("OUT_DIR"), "/extensions_need_binary_check.rs") - ); - let names: HashMap<&str, HashSet<&str>> = include!( - concat!(env!("OUT_DIR"), "/names.rs") - ); - let interpreters: HashMap<&str, HashSet<&str>> = include!( - concat!(env!("OUT_DIR"), "/interpreters.rs") - ); + let args: Vec = env::args().skip(1).collect(); + if args.len() < 1 { + eprintln!("Usage: identify [--filename-only] FILE"); + return; + } - println!("{:?}", extensions["bash"]); - println!("{:?}", extensions_need_binary_check["ppm"]); - println!("{:?}", names[".flake8"]); - println!("{:?}", interpreters["python3"]); + if args[0] == "--filename-only" { + println!("{:?}", identify::tags_from_filename(&args[1])); + } else { + println!("{:?}", identify::tags_from_path(&args[0])); + } } diff --git a/src/tags.rs b/src/tags.rs new file mode 100644 index 0000000..e1534e5 --- /dev/null +++ b/src/tags.rs @@ -0,0 +1,15 @@ +use phf::phf_map; +use phf::phf_set; + + +pub const NAMES: phf::Map<&str, phf::Set<&str>> = + include!(concat!(env!("OUT_DIR"), "/names.rs")); + +pub const EXTENSIONS: phf::Map<&str, phf::Set<&str>> = + include!(concat!(env!("OUT_DIR"), "/extensions.rs")); + +pub const EXTENSIONS_NEED_BINARY_CHECK: phf::Map<&str, phf::Set<&str>> = + include!(concat!(env!("OUT_DIR"), "/extensions_need_binary_check.rs")); + +pub const INTERPRETERS: phf::Map<&str, phf::Set<&str>> = + include!(concat!(env!("OUT_DIR"), "/interpreters.rs"));