1use crate::traits::math::Vector;
326use crate::util::{print::LowerExpWithPlus, useful::tab};
327#[cfg(feature = "parquet")]
328use arrow::datatypes::{
329 Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type,
330 UInt64Type, UInt8Type,
331};
332use std::cmp::{max, min};
333#[cfg(feature = "csv")]
334use std::collections::HashMap;
335#[cfg(feature = "parquet")]
336use indexmap::IndexMap;
337#[cfg(any(feature = "csv", feature = "nc", feature = "parquet"))]
338use std::error::Error;
339use std::fmt;
340use std::ops::{Index, IndexMut};
341#[cfg(feature = "parquet")]
342use std::sync::Arc;
343use DType::{Bool, Char, Str, F32, F64, I16, I32, I64, I8, ISIZE, U16, U32, U64, U8, USIZE};
344
345#[cfg(feature = "parquet")]
346use arrow::{
347 array::{Array, BooleanArray, PrimitiveArray, StringArray},
348 datatypes::{DataType, Field, Schema},
349};
350#[cfg(feature = "csv")]
351use csv::{ReaderBuilder, WriterBuilder};
352#[cfg(feature = "nc")]
353use netcdf::{
354 types::VariableType,
355 variable::{Variable, VariableMut},
356 Numeric,
357};
358#[cfg(feature = "parquet")]
359use parquet::{
360 arrow::arrow_reader::ParquetRecordBatchReaderBuilder,
361 arrow::arrow_writer::compute_leaves,
362 arrow::arrow_writer::get_column_writers,
363 arrow::arrow_writer::ArrowLeafColumn,
364 arrow::ArrowSchemaConverter,
365 basic::Compression,
366 file::properties::WriterProperties,
367 file::writer::{SerializedFileWriter, SerializedRowGroupWriter},
368};
369
/// Runtime tag naming the primitive element type carried by a `Series` or `Scalar`.
///
/// Each variant stands for the Rust type of the same name; `Str` stands for `String`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DType {
    /// `usize`
    USIZE,
    /// `u8`
    U8,
    /// `u16`
    U16,
    /// `u32`
    U32,
    /// `u64`
    U64,
    /// `isize`
    ISIZE,
    /// `i8`
    I8,
    /// `i16`
    I16,
    /// `i32`
    I32,
    /// `i64`
    I64,
    /// `f32`
    F32,
    /// `f64`
    F64,
    /// `bool`
    Bool,
    /// `String`
    Str,
    /// `char`
    Char,
}
393
/// Owned, homogeneously typed column storage: one variant per supported element type.
#[derive(Clone, Debug, PartialEq)]
pub enum DTypeArray {
    /// Column of `usize` values.
    USIZE(Vec<usize>),
    /// Column of `u8` values.
    U8(Vec<u8>),
    /// Column of `u16` values.
    U16(Vec<u16>),
    /// Column of `u32` values.
    U32(Vec<u32>),
    /// Column of `u64` values.
    U64(Vec<u64>),
    /// Column of `isize` values.
    ISIZE(Vec<isize>),
    /// Column of `i8` values.
    I8(Vec<i8>),
    /// Column of `i16` values.
    I16(Vec<i16>),
    /// Column of `i32` values.
    I32(Vec<i32>),
    /// Column of `i64` values.
    I64(Vec<i64>),
    /// Column of `f32` values.
    F32(Vec<f32>),
    /// Column of `f64` values.
    F64(Vec<f64>),
    /// Column of `bool` values.
    Bool(Vec<bool>),
    /// Column of `String` values.
    Str(Vec<String>),
    /// Column of `char` values.
    Char(Vec<char>),
}
413
/// A single dynamically typed value: one variant per supported element type.
///
/// The derived `PartialOrd` orders values of different variants by declaration
/// order, so the variant order below is significant.
#[derive(Clone, Debug, PartialEq, PartialOrd)]
pub enum DTypeValue {
    /// A `usize` value.
    USIZE(usize),
    /// A `u8` value.
    U8(u8),
    /// A `u16` value.
    U16(u16),
    /// A `u32` value.
    U32(u32),
    /// A `u64` value.
    U64(u64),
    /// An `isize` value.
    ISIZE(isize),
    /// An `i8` value.
    I8(i8),
    /// An `i16` value.
    I16(i16),
    /// An `i32` value.
    I32(i32),
    /// An `i64` value.
    I64(i64),
    /// An `f32` value.
    F32(f32),
    /// An `f64` value.
    F64(f64),
    /// A `bool` value.
    Bool(bool),
    /// A `String` value.
    Str(String),
    /// A `char` value.
    Char(char),
}
433
434#[derive(Debug, Clone, PartialEq)]
469pub struct DataFrame {
470 pub data: Vec<Series>,
471 pub ics: Vec<String>,
472}
473
474#[derive(Debug, Clone, PartialEq)]
495pub struct Series {
496 pub values: DTypeArray,
497 pub dtype: DType,
498}
499
500#[derive(Debug, Clone, PartialEq)]
502pub struct Scalar {
503 pub value: DTypeValue,
504 pub dtype: DType,
505}
506
/// Conversion between a plain Rust value of type `T` and a scalar wrapper.
pub trait TypedScalar<T> {
    /// Wraps `s` in the implementing type.
    fn new(s: T) -> Self
    where
        Self: Sized;

    /// Consumes the wrapper and returns the contained `T`.
    fn unwrap(self) -> T;
}
516
/// Typed, vector-like access to a container whose elements are of type `T`.
pub trait TypedVector<T> {
    /// Builds the container from an owned vector.
    fn new(v: Vec<T>) -> Self;

    /// Returns an owned copy of the elements.
    fn to_vec(&self) -> Vec<T>;

    /// Borrows the elements as a slice.
    fn as_slice(&self) -> &[T];

    /// Mutably borrows the elements as a slice.
    fn as_slice_mut(&mut self) -> &mut [T];

    /// Returns the element at index `i` by value.
    fn at_raw(&self, i: usize) -> T;

    /// Appends `elem` at the end.
    fn push(&mut self, elem: T);

    /// Applies `f` to every element and collects the results into a new container.
    fn map<F>(&self, f: F) -> Self
    where
        F: Fn(T) -> T;

    /// Applies `f` to every element in place.
    fn mut_map<F>(&mut self, f: F)
    where
        F: Fn(&mut T);

    /// Folds the elements from the left, starting with `init`.
    fn fold<F>(&self, init: T, f: F) -> T
    where
        F: Fn(T, T) -> T;

    /// Keeps only the elements for which `f` returns `true`.
    fn filter<F>(&self, f: F) -> Self
    where
        F: Fn(&T) -> bool;

    /// Keeps at most the first `n` elements.
    fn take(&self, n: usize) -> Self;

    /// Drops the first `n` elements.
    fn skip(&self, n: usize) -> Self;

    /// Keeps the leading elements for which `f` returns `true`.
    fn take_while<F>(&self, f: F) -> Self
    where
        F: Fn(&T) -> bool;

    /// Drops the leading elements for which `f` returns `true`.
    fn skip_while<F>(&self, f: F) -> Self
    where
        F: Fn(&T) -> bool;

    /// Combines the elements of `self` and `other` pairwise with `f`.
    fn zip_with<F>(&self, f: F, other: &Self) -> Self
    where
        F: Fn(T, T) -> T;
}
534
// Generates `impl TypedScalar<$native> for Scalar`, pairing a Rust type with the
// `DType` / `DTypeValue` variant named `$variant`.
macro_rules! impl_typed_scalar {
    ($native:ty, $variant:ident) => {
        impl TypedScalar<$native> for Scalar {
            // Wraps the value and records the matching dtype tag.
            fn new(s: $native) -> Self {
                Scalar {
                    dtype: DType::$variant,
                    value: DTypeValue::$variant(s),
                }
            }

            // Returns the inner value; panics when the stored variant is a different type.
            fn unwrap(self) -> $native {
                if let DTypeValue::$variant(inner) = self.value {
                    inner
                } else {
                    panic!("Can't unwrap {:?} value", $variant)
                }
            }
        }
    };
}
557
// Generates `impl TypedVector<$elem> for Series`, pairing a Rust element type with
// the `DType` / `DTypeArray` variant named `$variant`.
//
// Every accessor panics with "Can't convert to ... vector" when the series
// actually stores a different variant.
macro_rules! impl_typed_vector {
    ($elem:ty, $variant:ident) => {
        impl TypedVector<$elem> for Series {
            fn new(v: Vec<$elem>) -> Self {
                Series {
                    dtype: DType::$variant,
                    values: DTypeArray::$variant(v),
                }
            }

            fn to_vec(&self) -> Vec<$elem> {
                let items: &[$elem] = self.as_slice();
                items.to_vec()
            }

            fn as_slice(&self) -> &[$elem] {
                if let DTypeArray::$variant(items) = &self.values {
                    items.as_slice()
                } else {
                    panic!("Can't convert to {:?} vector", $variant)
                }
            }

            fn as_slice_mut(&mut self) -> &mut [$elem] {
                if let DTypeArray::$variant(items) = &mut self.values {
                    items.as_mut_slice()
                } else {
                    panic!("Can't convert to {:?} vector", $variant)
                }
            }

            fn at_raw(&self, i: usize) -> $elem {
                let items: &[$elem] = self.as_slice();
                items[i].clone()
            }

            fn push(&mut self, elem: $elem) {
                match &mut self.values {
                    DTypeArray::$variant(items) => items.push(elem),
                    _ => panic!("Can't convert to {:?} vector", $variant),
                }
            }

            fn map<F: Fn($elem) -> $elem>(&self, f: F) -> Self {
                let items: &[$elem] = self.as_slice();
                let mapped: Vec<$elem> = items.iter().cloned().map(f).collect();
                <Self as TypedVector<$elem>>::new(mapped)
            }

            fn mut_map<F: Fn(&mut $elem)>(&mut self, f: F) {
                let items: &mut [$elem] = self.as_slice_mut();
                for item in items.iter_mut() {
                    f(item);
                }
            }

            fn fold<F: Fn($elem, $elem) -> $elem>(&self, init: $elem, f: F) -> $elem {
                let items: &[$elem] = self.as_slice();
                items.iter().cloned().fold(init, f)
            }

            fn filter<F: Fn(&$elem) -> bool>(&self, f: F) -> Self {
                let items: &[$elem] = self.as_slice();
                let kept: Vec<$elem> = items.iter().filter(|x| f(x)).cloned().collect();
                <Self as TypedVector<$elem>>::new(kept)
            }

            fn take(&self, n: usize) -> Self {
                let items: &[$elem] = self.as_slice();
                let head: Vec<$elem> = items.iter().take(n).cloned().collect();
                <Self as TypedVector<$elem>>::new(head)
            }

            fn skip(&self, n: usize) -> Self {
                let items: &[$elem] = self.as_slice();
                let tail: Vec<$elem> = items.iter().skip(n).cloned().collect();
                <Self as TypedVector<$elem>>::new(tail)
            }

            fn take_while<F: Fn(&$elem) -> bool>(&self, f: F) -> Self {
                let items: &[$elem] = self.as_slice();
                let head: Vec<$elem> = items.iter().take_while(|x| f(x)).cloned().collect();
                <Self as TypedVector<$elem>>::new(head)
            }

            fn skip_while<F: Fn(&$elem) -> bool>(&self, f: F) -> Self {
                let items: &[$elem] = self.as_slice();
                let tail: Vec<$elem> = items.iter().skip_while(|x| f(x)).cloned().collect();
                <Self as TypedVector<$elem>>::new(tail)
            }

            fn zip_with<F: Fn($elem, $elem) -> $elem>(&self, f: F, other: &Self) -> Self {
                let lhs: &[$elem] = self.as_slice();
                let rhs: &[$elem] = other.as_slice();
                // `zip` stops at the shorter of the two inputs.
                let combined: Vec<$elem> = lhs
                    .iter()
                    .cloned()
                    .zip(rhs.iter().cloned())
                    .map(|(a, b)| f(a, b))
                    .collect();
                <Self as TypedVector<$elem>>::new(combined)
            }
        }
    };
}
652
// Evaluates `$val` with the expected type `$ty`, then hands the typed value to `$wrap`.
// The type annotation is what selects the right generic impl at the call site.
macro_rules! dtype_case {
    ($ty:ty, $val:expr, $wrap:expr) => {{
        let typed: $ty = $val;
        $wrap(typed)
    }};
}
659
// Dispatches on a `DType` value and runs `$wrapper($value)` with `$value` typed
// as the Rust type that the dtype stands for.
//
// Arms:
// * `(dtype, value, wrapper)`            - all dtypes, value typed as `T`.
// * `(dtype, value, wrapper; Functor)`   - all dtypes, value typed as `Functor<T>`.
// * `(N; dtype, value, wrapper)`         - fixed-width numeric dtypes only, value typed as `T`.
// * `(N; dtype, value, wrapper; Functor)`- fixed-width numeric dtypes only, value typed as `Functor<T>`.
//
// The `N;` arms panic with "Can't use {dtype} to numeric" for any other dtype
// (including `USIZE` and `ISIZE`).
macro_rules! dtype_match {
    ($dtype:expr, $value:expr, $wrapper:expr) => {{
        match $dtype {
            USIZE => dtype_case!(usize, $value, $wrapper),
            U8 => dtype_case!(u8, $value, $wrapper),
            U16 => dtype_case!(u16, $value, $wrapper),
            U32 => dtype_case!(u32, $value, $wrapper),
            U64 => dtype_case!(u64, $value, $wrapper),
            ISIZE => dtype_case!(isize, $value, $wrapper),
            I8 => dtype_case!(i8, $value, $wrapper),
            I16 => dtype_case!(i16, $value, $wrapper),
            I32 => dtype_case!(i32, $value, $wrapper),
            I64 => dtype_case!(i64, $value, $wrapper),
            F32 => dtype_case!(f32, $value, $wrapper),
            F64 => dtype_case!(f64, $value, $wrapper),
            Bool => dtype_case!(bool, $value, $wrapper),
            Char => dtype_case!(char, $value, $wrapper),
            Str => dtype_case!(String, $value, $wrapper),
        }
    }};

    ($dtype:expr, $value:expr, $wrapper:expr; $functor:ident) => {{
        match $dtype {
            USIZE => dtype_case!($functor<usize>, $value, $wrapper),
            U8 => dtype_case!($functor<u8>, $value, $wrapper),
            U16 => dtype_case!($functor<u16>, $value, $wrapper),
            U32 => dtype_case!($functor<u32>, $value, $wrapper),
            U64 => dtype_case!($functor<u64>, $value, $wrapper),
            ISIZE => dtype_case!($functor<isize>, $value, $wrapper),
            I8 => dtype_case!($functor<i8>, $value, $wrapper),
            I16 => dtype_case!($functor<i16>, $value, $wrapper),
            I32 => dtype_case!($functor<i32>, $value, $wrapper),
            I64 => dtype_case!($functor<i64>, $value, $wrapper),
            F32 => dtype_case!($functor<f32>, $value, $wrapper),
            F64 => dtype_case!($functor<f64>, $value, $wrapper),
            Bool => dtype_case!($functor<bool>, $value, $wrapper),
            Char => dtype_case!($functor<char>, $value, $wrapper),
            Str => dtype_case!($functor<String>, $value, $wrapper),
        }
    }};

    (N; $dtype:expr, $value:expr, $wrapper:expr) => {{
        match $dtype {
            U8 => dtype_case!(u8, $value, $wrapper),
            U16 => dtype_case!(u16, $value, $wrapper),
            U32 => dtype_case!(u32, $value, $wrapper),
            U64 => dtype_case!(u64, $value, $wrapper),
            I8 => dtype_case!(i8, $value, $wrapper),
            I16 => dtype_case!(i16, $value, $wrapper),
            I32 => dtype_case!(i32, $value, $wrapper),
            I64 => dtype_case!(i64, $value, $wrapper),
            F32 => dtype_case!(f32, $value, $wrapper),
            F64 => dtype_case!(f64, $value, $wrapper),
            // A match arm must end with `,`; a `;` here breaks every expansion of this arm.
            _ => panic!("Can't use {} to numeric", $dtype),
        }
    }};

    (N; $dtype:expr, $value:expr, $wrapper:expr; $functor:ident) => {{
        match $dtype {
            U8 => dtype_case!($functor<u8>, $value, $wrapper),
            U16 => dtype_case!($functor<u16>, $value, $wrapper),
            U32 => dtype_case!($functor<u32>, $value, $wrapper),
            U64 => dtype_case!($functor<u64>, $value, $wrapper),
            I8 => dtype_case!($functor<i8>, $value, $wrapper),
            I16 => dtype_case!($functor<i16>, $value, $wrapper),
            I32 => dtype_case!($functor<i32>, $value, $wrapper),
            I64 => dtype_case!($functor<i64>, $value, $wrapper),
            F32 => dtype_case!($functor<f32>, $value, $wrapper),
            F64 => dtype_case!($functor<f64>, $value, $wrapper),
            _ => panic!("Can't use {} to numeric", $dtype),
        }
    }};
}
733
// Formatting helper for printing scalars in a table.
//
// * `set_space!(elem)` returns the display string of `elem`; for `F32`/`F64` it is
//   the shorter of the `fmt_lower_exp(2)` form and the plain `to_string()` form
//   (the plain form wins ties).
// * `set_space!(elem, space)` widens `space` to at least the length of that string.
macro_rules! set_space {
    ($elem:expr) => {{
        match $elem.dtype {
            F32 => {
                let raw: f32 = $elem.unwrap();
                let sci = raw.fmt_lower_exp(2);
                let plain = raw.to_string();

                if plain.len() <= sci.len() {
                    plain
                } else {
                    sci
                }
            }
            F64 => {
                let raw: f64 = $elem.unwrap();
                let sci = raw.fmt_lower_exp(2);
                let plain = raw.to_string();

                if plain.len() <= sci.len() {
                    plain
                } else {
                    sci
                }
            }
            _ => $elem.to_string(),
        }
    }};

    ($elem:expr, $space:expr) => {{
        match $elem.dtype {
            F32 => {
                let raw: f32 = $elem.unwrap();
                let sci_len = raw.fmt_lower_exp(2).len();
                let plain_len = raw.to_string().len();
                $space = max($space, min(sci_len, plain_len));
            }
            F64 => {
                let raw: f64 = $elem.unwrap();
                let sci_len = raw.fmt_lower_exp(2).len();
                let plain_len = raw.to_string().len();
                $space = max($space, min(sci_len, plain_len));
            }
            _ => {
                let shown = $elem.to_string();
                $space = max($space, shown.len());
            }
        }
    }};
}
785
// Renders an indexable float collection as "[a, b, c]", choosing for each element
// the shorter of its `fmt_lower_exp(2)` form and its plain `to_string()` form
// (the plain form wins ties). An empty collection renders as "[]".
macro_rules! format_float_vec {
    ($self:expr) => {{
        let count = $self.len();
        let mut rendered = String::from("[");
        for idx in 0..count {
            // Separator goes before every element except the first.
            if idx > 0 {
                rendered.push_str(", ");
            }
            let sci = $self[idx].fmt_lower_exp(2);
            let plain = $self[idx].to_string();
            if sci.len() < plain.len() {
                rendered.push_str(&sci);
            } else {
                rendered.push_str(&plain);
            }
        }
        rendered.push(']');
        rendered
    }};
}
804
// Converts a `Vec<$from>` into a `Vec<$to>` with element-wise `as` casts and passes
// the result to `$wrapper`.
macro_rules! type_cast_vec {
    ($from:ty, $to:ty, $to_vec:expr, $wrapper:expr) => {{
        let source: Vec<$from> = $to_vec;
        let mut converted: Vec<$to> = Vec::with_capacity(source.len());
        for item in source {
            converted.push(item as $to);
        }
        $wrapper(converted)
    }};
}
813
// Converts a `Vec<$from>` into a `Vec<String>` via `to_string()` and passes the
// result to `$wrapper`.
macro_rules! string_cast_vec {
    ($from:ty, $to_vec:expr, $wrapper:expr) => {{
        let source: Vec<$from> = $to_vec;
        let mut rendered: Vec<String> = Vec::with_capacity(source.len());
        for item in source {
            rendered.push(item.to_string());
        }
        $wrapper(rendered)
    }};
}
821
// Parses every string of the input into `$target` and passes the resulting vector
// to `$wrapper`. Panics (via `unwrap`) on the first element that fails to parse.
macro_rules! type_parse_vec {
    ($target:ty, $to_vec:expr, $wrapper:expr) => {{
        let raw: Vec<String> = $to_vec.to_vec();
        let mut parsed: Vec<$target> = Vec::with_capacity(raw.len());
        for text in raw {
            parsed.push(text.parse().unwrap());
        }
        $wrapper(parsed)
    }};
}
829
// Chooses the parse target from the destination dtype `$target` and delegates to
// `type_parse_vec!`, which parses a string column into that type.
macro_rules! dtype_parse_vec_part {
    ($target:expr, $to_vec:expr, $wrapper:expr) => {{
        match $target {
            // Text-like and boolean targets.
            Str => type_parse_vec!(String, $to_vec, $wrapper),
            Char => type_parse_vec!(char, $to_vec, $wrapper),
            Bool => type_parse_vec!(bool, $to_vec, $wrapper),
            // Floating-point targets.
            F64 => type_parse_vec!(f64, $to_vec, $wrapper),
            F32 => type_parse_vec!(f32, $to_vec, $wrapper),
            // Signed integer targets.
            ISIZE => type_parse_vec!(isize, $to_vec, $wrapper),
            I64 => type_parse_vec!(i64, $to_vec, $wrapper),
            I32 => type_parse_vec!(i32, $to_vec, $wrapper),
            I16 => type_parse_vec!(i16, $to_vec, $wrapper),
            I8 => type_parse_vec!(i8, $to_vec, $wrapper),
            // Unsigned integer targets.
            USIZE => type_parse_vec!(usize, $to_vec, $wrapper),
            U64 => type_parse_vec!(u64, $to_vec, $wrapper),
            U32 => type_parse_vec!(u32, $to_vec, $wrapper),
            U16 => type_parse_vec!(u16, $to_vec, $wrapper),
            U8 => type_parse_vec!(u8, $to_vec, $wrapper),
        }
    }};
}
851
// Casts a `Vec<$src>` to the element type named by the destination dtype `$target`:
// numeric targets use `type_cast_vec!`, `Str` uses `string_cast_vec!`, and any
// other target (`Bool`, `Char`) panics.
macro_rules! dtype_cast_vec_part {
    ($src:ty, $target:expr, $to_vec:expr, $wrapper:expr) => {{
        match $target {
            Str => string_cast_vec!($src, $to_vec, $wrapper),
            // Floating-point targets.
            F64 => type_cast_vec!($src, f64, $to_vec, $wrapper),
            F32 => type_cast_vec!($src, f32, $to_vec, $wrapper),
            // Signed integer targets.
            ISIZE => type_cast_vec!($src, isize, $to_vec, $wrapper),
            I64 => type_cast_vec!($src, i64, $to_vec, $wrapper),
            I32 => type_cast_vec!($src, i32, $to_vec, $wrapper),
            I16 => type_cast_vec!($src, i16, $to_vec, $wrapper),
            I8 => type_cast_vec!($src, i8, $to_vec, $wrapper),
            // Unsigned integer targets.
            USIZE => type_cast_vec!($src, usize, $to_vec, $wrapper),
            U64 => type_cast_vec!($src, u64, $to_vec, $wrapper),
            U32 => type_cast_vec!($src, u32, $to_vec, $wrapper),
            U16 => type_cast_vec!($src, u16, $to_vec, $wrapper),
            U8 => type_cast_vec!($src, u8, $to_vec, $wrapper),
            _ => panic!("Can't convert to {}", $target),
        }
    }};
}
872
// Converts a column from source dtype `$from` to destination dtype `$to`.
//
// * Numeric sources go through `dtype_cast_vec_part!` (`as` casts, or `to_string` for `Str`).
// * `U8` additionally converts to `Bool` (non-zero is `true`) and to `Char`.
// * `Str` sources are parsed with `dtype_parse_vec_part!`.
// * `Char` converts only to `Str` and `U8`; `Bool` converts only to `U8` and `Bool`.
//   Every other combination for these two sources panics.
macro_rules! dtype_cast_vec {
    ($from:expr, $to:expr, $to_vec:expr, $wrapper:expr) => {{
        match $from {
            Str => dtype_parse_vec_part!($to, $to_vec, $wrapper),
            Bool => match $to {
                Bool => {
                    let flags: Vec<bool> = $to_vec;
                    $wrapper(flags)
                }
                U8 => {
                    let flags: Vec<bool> = $to_vec;
                    let bytes: Vec<u8> = flags.into_iter().map(u8::from).collect();
                    $wrapper(bytes)
                }
                _ => panic!("Can't convert bool type to {}", $to),
            },
            Char => match $to {
                U8 => {
                    let chars: Vec<char> = $to_vec;
                    // Truncating cast: only the low byte of each code point is kept.
                    let bytes: Vec<u8> = chars.into_iter().map(|c| c as u8).collect();
                    $wrapper(bytes)
                }
                Str => string_cast_vec!(char, $to_vec, $wrapper),
                _ => panic!("Can't convert char type to {}", $to),
            },
            U8 => match $to {
                Char => {
                    let bytes: Vec<u8> = $to_vec;
                    let chars: Vec<char> = bytes.into_iter().map(char::from).collect();
                    $wrapper(chars)
                }
                Bool => {
                    let bytes: Vec<u8> = $to_vec;
                    let flags: Vec<bool> = bytes.into_iter().map(|b| b != 0).collect();
                    $wrapper(flags)
                }
                _ => dtype_cast_vec_part!(u8, $to, $to_vec, $wrapper),
            },
            USIZE => dtype_cast_vec_part!(usize, $to, $to_vec, $wrapper),
            U16 => dtype_cast_vec_part!(u16, $to, $to_vec, $wrapper),
            U32 => dtype_cast_vec_part!(u32, $to, $to_vec, $wrapper),
            U64 => dtype_cast_vec_part!(u64, $to, $to_vec, $wrapper),
            ISIZE => dtype_cast_vec_part!(isize, $to, $to_vec, $wrapper),
            I8 => dtype_cast_vec_part!(i8, $to, $to_vec, $wrapper),
            I16 => dtype_cast_vec_part!(i16, $to, $to_vec, $wrapper),
            I32 => dtype_cast_vec_part!(i32, $to, $to_vec, $wrapper),
            I64 => dtype_cast_vec_part!(i64, $to, $to_vec, $wrapper),
            F32 => dtype_cast_vec_part!(f32, $to, $to_vec, $wrapper),
            F64 => dtype_cast_vec_part!(f64, $to, $to_vec, $wrapper),
        }
    }};
}
925
/// Returns the number of elements in `x`, consuming the vector.
///
/// Exists as a free function so it can be passed as the wrapper of `dtype_match!`.
fn len<T>(x: Vec<T>) -> usize {
    let count = x.as_slice().len();
    count
}
929
/// Renders `x` through its `Display` implementation.
///
/// Exists as a free function so it can be passed as the wrapper of `dtype_match!`.
fn to_string<T: fmt::Display>(x: T) -> String {
    format!("{}", x)
}
933
/// Maps a `DType` to the netCDF basic type used to store it.
///
/// `usize`/`isize` are stored as 64-bit integers, and `Bool`/`Char` as unsigned bytes.
///
/// # Panics
/// Panics for `Str`, which has no basic-type counterpart here.
#[cfg(feature = "nc")]
fn dtype_to_vtype(dt: DType) -> netcdf::types::BasicType {
    use netcdf::types::BasicType;

    match dt {
        U8 | Bool | Char => BasicType::Ubyte,
        U16 => BasicType::Ushort,
        U32 => BasicType::Uint,
        USIZE | U64 => BasicType::Uint64,
        I8 => BasicType::Byte,
        I16 => BasicType::Short,
        I32 => BasicType::Int,
        ISIZE | I64 => BasicType::Int64,
        F32 => BasicType::Float,
        F64 => BasicType::Double,
        Str => panic!("Can't convert type to netcdf::types::BasicType"),
    }
}
954
/// Maps a netCDF basic type back to the `DType` used to hold it in a `Series`.
#[cfg(feature = "nc")]
fn vtype_to_dtype(dv: netcdf::types::BasicType) -> DType {
    use netcdf::types::BasicType;

    match dv {
        // Floating point.
        BasicType::Double => F64,
        BasicType::Float => F32,
        // Signed integers.
        BasicType::Int64 => I64,
        BasicType::Int => I32,
        BasicType::Short => I16,
        BasicType::Byte => I8,
        // Unsigned integers.
        BasicType::Uint64 => U64,
        BasicType::Uint => U32,
        BasicType::Ushort => U16,
        BasicType::Ubyte => U8,
        // Characters.
        BasicType::Char => Char,
    }
}
971
/// Writes all of `v` into the netCDF variable `var`.
///
/// Both optional arguments of `put_values` are passed as `None`.
#[cfg(feature = "nc")]
fn nc_put_value<T: Numeric>(var: &mut VariableMut, v: Vec<T>) -> Result<(), netcdf::error::Error> {
    let payload: &[T] = v.as_slice();
    var.put_values(payload, None, None)
}
976
/// Reads every value of the netCDF variable `val` into a new `Series`.
///
/// `v` is used as the read buffer: it is resized to `val.len()` (new slots are
/// filled with `T::default()`), overwritten by `values_to`, and then moved into
/// the returned series.
///
/// # Errors
/// Returns the error reported by `values_to` if the read fails.
#[cfg(feature = "nc")]
fn nc_read_value<T: Numeric + Default + Clone>(
    val: &Variable,
    mut v: Vec<T>,
) -> Result<Series, netcdf::error::Error>
where
    Series: TypedVector<T>,
{
    v.resize_with(val.len(), Default::default);
    val.values_to(&mut v, None, None)?;
    // The buffer is owned and not used afterwards, so move it instead of cloning.
    Ok(Series::new(v))
}
990
/// Maps a `DType` to the Arrow data type used to store it.
///
/// `usize`/`isize` are stored as 64-bit integers, and both `Str` and `Char` as UTF-8.
#[cfg(feature = "parquet")]
fn dtype_to_arrow(dt: DType) -> DataType {
    match dt {
        Bool => DataType::Boolean,
        Str | Char => DataType::Utf8,
        F32 => DataType::Float32,
        F64 => DataType::Float64,
        I8 => DataType::Int8,
        I16 => DataType::Int16,
        I32 => DataType::Int32,
        ISIZE | I64 => DataType::Int64,
        U8 => DataType::UInt8,
        U16 => DataType::UInt16,
        U32 => DataType::UInt32,
        USIZE | U64 => DataType::UInt64,
    }
}
1011
/// Maps an Arrow data type back to the `DType` used to hold it in a `Series`.
///
/// # Panics
/// Panics with `unimplemented!()` for any Arrow type other than booleans,
/// 8/16/32/64-bit integers, 32/64-bit floats, and UTF-8 strings.
#[cfg(feature = "parquet")]
fn arrow_to_dtype(dt: DataType) -> DType {
    match dt {
        // Unsigned integers.
        DataType::UInt8 => U8,
        DataType::UInt16 => U16,
        DataType::UInt32 => U32,
        DataType::UInt64 => U64,
        // Signed integers.
        DataType::Int8 => I8,
        DataType::Int16 => I16,
        DataType::Int32 => I32,
        DataType::Int64 => I64,
        // Floating point.
        DataType::Float32 => F32,
        DataType::Float64 => F64,
        // Everything else that is supported.
        DataType::Boolean => Bool,
        DataType::Utf8 => Str,
        _ => unimplemented!(),
    }
}
1031
// Builds an Arrow array of exactly `$length` slots from a `Vec<$ty>` and pushes it
// onto `$chunk_vec`. Slots past the end of the input vector become nulls (`None`);
// input elements past `$length` are dropped.
#[cfg(feature = "parquet")]
macro_rules! dtype_case_to_arrow {
    ($ty:ty, $to_arr:expr, $value:expr, $chunk_vec:expr; $length:expr) => {{
        let column: Vec<$ty> = $value;
        let padded = (0usize..$length)
            .map(|slot| column.get(slot).cloned())
            .collect::<Vec<_>>();
        let array = $to_arr(padded);
        $chunk_vec.push(Arc::new(array) as Arc<dyn Array>);
    }};
}
1049
// Dispatches on a `DType` and appends the matching Arrow array (padded to `$length`
// by `dtype_case_to_arrow!`) to `$chunk_vec`.
//
// `USIZE`/`ISIZE` expect `$value` to already be a `Vec<u64>`/`Vec<i64>`, and `Char`
// columns are written as one-character strings.
#[cfg(feature = "parquet")]
macro_rules! dtype_match_to_arrow {
    ($dtype:expr, $value:expr, $chunk_vec:expr; $length:expr) => {{
        match $dtype {
            // Floating point.
            F64 => dtype_case_to_arrow!(f64, PrimitiveArray::<Float64Type>::from, $value, $chunk_vec; $length),
            F32 => dtype_case_to_arrow!(f32, PrimitiveArray::<Float32Type>::from, $value, $chunk_vec; $length),
            // Signed integers.
            ISIZE => dtype_case_to_arrow!(i64, PrimitiveArray::<Int64Type>::from, $value, $chunk_vec; $length),
            I64 => dtype_case_to_arrow!(i64, PrimitiveArray::<Int64Type>::from, $value, $chunk_vec; $length),
            I32 => dtype_case_to_arrow!(i32, PrimitiveArray::<Int32Type>::from, $value, $chunk_vec; $length),
            I16 => dtype_case_to_arrow!(i16, PrimitiveArray::<Int16Type>::from, $value, $chunk_vec; $length),
            I8 => dtype_case_to_arrow!(i8, PrimitiveArray::<Int8Type>::from, $value, $chunk_vec; $length),
            // Unsigned integers.
            USIZE => dtype_case_to_arrow!(u64, PrimitiveArray::<UInt64Type>::from, $value, $chunk_vec; $length),
            U64 => dtype_case_to_arrow!(u64, PrimitiveArray::<UInt64Type>::from, $value, $chunk_vec; $length),
            U32 => dtype_case_to_arrow!(u32, PrimitiveArray::<UInt32Type>::from, $value, $chunk_vec; $length),
            U16 => dtype_case_to_arrow!(u16, PrimitiveArray::<UInt16Type>::from, $value, $chunk_vec; $length),
            U8 => dtype_case_to_arrow!(u8, PrimitiveArray::<UInt8Type>::from, $value, $chunk_vec; $length),
            // Booleans and text.
            Bool => dtype_case_to_arrow!(bool, BooleanArray::from, $value, $chunk_vec; $length),
            Str => dtype_case_to_arrow!(String, StringArray::from, $value, $chunk_vec; $length),
            Char => {
                let chars: Vec<char> = $value;
                let strings = chars
                    .into_iter()
                    .map(|c| c.to_string())
                    .collect::<Vec<_>>();
                dtype_case_to_arrow!(String, StringArray::from, strings, $chunk_vec; $length)
            }
        }
    }};
}
1076
1077fn add_vec<T: std::ops::Add<T, Output = T> + Clone>(v: Vec<T>, w: Vec<T>) -> Series
1078where
1079 Series: TypedVector<T>,
1080{
1081 Series::new(v.into_iter().zip(w).map(|(x, y)| x + y).collect::<Vec<T>>())
1082}
1083
1084fn sub_vec<T: std::ops::Sub<T, Output = T> + Clone>(v: Vec<T>, w: Vec<T>) -> Series
1085where
1086 Series: TypedVector<T>,
1087{
1088 Series::new(v.into_iter().zip(w).map(|(x, y)| x - y).collect::<Vec<T>>())
1089}
1090
1091fn mul_scalar<T: std::ops::Mul<T, Output = T> + Clone + Copy>(v: Vec<T>, s: T) -> Series
1092where
1093 Series: TypedVector<T>,
1094{
1095 Series::new(v.into_iter().map(|x| x * s).collect::<Vec<T>>())
1096}
1097
1098impl DType {
1102 pub fn is_numeric(&self) -> bool {
1104 match self {
1105 Bool => false,
1106 Str => false,
1107 Char => false,
1108 USIZE => false,
1109 ISIZE => false,
1110 _ => true,
1111 }
1112 }
1113
1114 pub fn is_integer(&self) -> bool {
1115 match self {
1116 Bool => false,
1117 Str => false,
1118 Char => false,
1119 F32 => false,
1120 F64 => false,
1121 _ => true,
1122 }
1123 }
1124}
1125
1126impl fmt::Display for DType {
1127 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1128 let st = match self {
1129 USIZE => "usize",
1130 U8 => "u8",
1131 U16 => "u16",
1132 U32 => "u32",
1133 U64 => "u64",
1134 ISIZE => "isize",
1135 I8 => "i8",
1136 I16 => "i16",
1137 I32 => "i32",
1138 I64 => "i64",
1139 F32 => "f32",
1140 F64 => "f64",
1141 Bool => "bool",
1142 Char => "char",
1143 Str => "String",
1144 };
1145 write!(f, "{}", st)
1146 }
1147}
1148
1149impl fmt::Display for DTypeArray {
1150 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1151 let st = match self {
1152 DTypeArray::USIZE(v) => format!("array: {:?}\ndtype: usize", v),
1153 DTypeArray::U8(v) => format!("array: {:?}\ndtype: u8", v),
1154 DTypeArray::U16(v) => format!("array: {:?}\ndtype: u16", v),
1155 DTypeArray::U32(v) => format!("array: {:?}\ndtype: u32", v),
1156 DTypeArray::U64(v) => format!("array: {:?}\ndtype: u64", v),
1157 DTypeArray::ISIZE(v) => format!("array: {:?}\ndtype: isize", v),
1158 DTypeArray::I8(v) => format!("array: {:?}\ndtype: i8", v),
1159 DTypeArray::I16(v) => format!("array: {:?}\ndtype: i16", v),
1160 DTypeArray::I32(v) => format!("array: {:?}\ndtype: i32", v),
1161 DTypeArray::I64(v) => format!("array: {:?}\ndtype: i64", v),
1162 DTypeArray::F32(v) => format!("array: {}\ndtype: f32", format_float_vec!(v)),
1163 DTypeArray::F64(v) => format!("array: {}\ndtype: f64", format_float_vec!(v)),
1164 DTypeArray::Bool(v) => format!("array: {:?}\ndtype: bool", v),
1165 DTypeArray::Str(v) => format!("array: {:?}\ndtype: String", v),
1166 DTypeArray::Char(v) => format!("array: {:?}\ndtype: char", v),
1167 };
1168 write!(f, "{}", st)
1169 }
1170}
1171
1172impl Scalar {
1177 pub fn to_series(self) -> Series {
1179 dtype_match!(self.dtype, vec![self.unwrap()], Series::new; Vec)
1180 }
1181
1182 pub fn to_string(self) -> String {
1183 dtype_match!(self.dtype, self.unwrap(), to_string)
1184 }
1185}
1186
1187impl Series {
1188 pub fn at(&self, i: usize) -> Scalar {
1204 dtype_match!(self.dtype, self.at_raw(i), Scalar::new)
1205 }
1206
1207 pub fn len(&self) -> usize {
1209 dtype_match!(self.dtype, self.as_slice().to_vec(), len; Vec)
1210 }
1211
1212 pub fn to_type(&self, dtype: DType) -> Series {
1214 dtype_cast_vec!(self.dtype, dtype, self.to_vec(), Series::new)
1215 }
1216
1217 pub fn as_type(&mut self, dtype: DType) {
1233 let x = self.to_type(dtype);
1234 self.dtype = x.dtype;
1235 self.values = x.values;
1236 }
1237
1238 pub fn select_indices(&self, indices: &[usize]) -> Series {
1253 macro_rules! extract_by_indices {
1254 ($array:expr, $type:ty) => {{
1255 let values: Vec<$type> = indices.iter().map(|&i| $array[i].clone()).collect();
1256 Series::new(values)
1257 }};
1258 }
1259
1260 match &self.values {
1261 DTypeArray::USIZE(v) => extract_by_indices!(v, usize),
1262 DTypeArray::U8(v) => extract_by_indices!(v, u8),
1263 DTypeArray::U16(v) => extract_by_indices!(v, u16),
1264 DTypeArray::U32(v) => extract_by_indices!(v, u32),
1265 DTypeArray::U64(v) => extract_by_indices!(v, u64),
1266 DTypeArray::ISIZE(v) => extract_by_indices!(v, isize),
1267 DTypeArray::I8(v) => extract_by_indices!(v, i8),
1268 DTypeArray::I16(v) => extract_by_indices!(v, i16),
1269 DTypeArray::I32(v) => extract_by_indices!(v, i32),
1270 DTypeArray::I64(v) => extract_by_indices!(v, i64),
1271 DTypeArray::F32(v) => extract_by_indices!(v, f32),
1272 DTypeArray::F64(v) => extract_by_indices!(v, f64),
1273 DTypeArray::Bool(v) => extract_by_indices!(v, bool),
1274 DTypeArray::Str(v) => extract_by_indices!(v, String),
1275 DTypeArray::Char(v) => extract_by_indices!(v, char),
1276 }
1277 }
1278
1279 pub fn to_f64_vec(&self) -> anyhow::Result<Vec<f64>> {
1283 match self.dtype {
1284 Bool | Char | Str => anyhow::bail!("Cannot convert {} Series to f64", self.dtype),
1285 _ => {
1286 let converted = self.to_type(F64);
1287 Ok(TypedVector::<f64>::to_vec(&converted))
1288 }
1289 }
1290 }
1291
1292 pub fn sum(&self) -> anyhow::Result<f64> {
1298 let v = self.to_f64_vec()?;
1299 Ok(v.iter().sum())
1300 }
1301
1302 pub fn mean(&self) -> anyhow::Result<f64> {
1304 use crate::statistics::stat::Statistics;
1305 let v = self.to_f64_vec()?;
1306 anyhow::ensure!(!v.is_empty(), "Cannot compute mean of empty Series");
1307 Ok(v.mean())
1308 }
1309
1310 pub fn var(&self) -> anyhow::Result<f64> {
1312 use crate::statistics::stat::Statistics;
1313 let v = self.to_f64_vec()?;
1314 anyhow::ensure!(v.len() > 1, "Cannot compute variance of Series with fewer than 2 elements");
1315 Ok(v.var())
1316 }
1317
1318 pub fn sd(&self) -> anyhow::Result<f64> {
1320 use crate::statistics::stat::Statistics;
1321 let v = self.to_f64_vec()?;
1322 anyhow::ensure!(v.len() > 1, "Cannot compute sd of Series with fewer than 2 elements");
1323 Ok(v.sd())
1324 }
1325
1326 pub fn min(&self) -> anyhow::Result<Scalar> {
1328 anyhow::ensure!(self.len() > 0, "Cannot compute min of empty Series");
1329
1330 macro_rules! typed_min {
1331 ($v:expr, $dtype:ident) => {{
1332 let min_val = $v.iter().cloned().reduce(|a, b| if a <= b { a } else { b }).unwrap();
1333 Ok(Scalar { value: DTypeValue::$dtype(min_val), dtype: DType::$dtype })
1334 }};
1335 }
1336
1337 match &self.values {
1338 DTypeArray::USIZE(v) => typed_min!(v, USIZE),
1339 DTypeArray::U8(v) => typed_min!(v, U8),
1340 DTypeArray::U16(v) => typed_min!(v, U16),
1341 DTypeArray::U32(v) => typed_min!(v, U32),
1342 DTypeArray::U64(v) => typed_min!(v, U64),
1343 DTypeArray::ISIZE(v) => typed_min!(v, ISIZE),
1344 DTypeArray::I8(v) => typed_min!(v, I8),
1345 DTypeArray::I16(v) => typed_min!(v, I16),
1346 DTypeArray::I32(v) => typed_min!(v, I32),
1347 DTypeArray::I64(v) => typed_min!(v, I64),
1348 DTypeArray::F32(v) => typed_min!(v, F32),
1349 DTypeArray::F64(v) => typed_min!(v, F64),
1350 DTypeArray::Bool(v) => typed_min!(v, Bool),
1351 DTypeArray::Char(v) => typed_min!(v, Char),
1352 DTypeArray::Str(v) => typed_min!(v, Str),
1353 }
1354 }
1355
1356 pub fn max(&self) -> anyhow::Result<Scalar> {
1358 anyhow::ensure!(self.len() > 0, "Cannot compute max of empty Series");
1359
1360 macro_rules! typed_max {
1361 ($v:expr, $dtype:ident) => {{
1362 let max_val = $v.iter().cloned().reduce(|a, b| if a >= b { a } else { b }).unwrap();
1363 Ok(Scalar { value: DTypeValue::$dtype(max_val), dtype: DType::$dtype })
1364 }};
1365 }
1366
1367 match &self.values {
1368 DTypeArray::USIZE(v) => typed_max!(v, USIZE),
1369 DTypeArray::U8(v) => typed_max!(v, U8),
1370 DTypeArray::U16(v) => typed_max!(v, U16),
1371 DTypeArray::U32(v) => typed_max!(v, U32),
1372 DTypeArray::U64(v) => typed_max!(v, U64),
1373 DTypeArray::ISIZE(v) => typed_max!(v, ISIZE),
1374 DTypeArray::I8(v) => typed_max!(v, I8),
1375 DTypeArray::I16(v) => typed_max!(v, I16),
1376 DTypeArray::I32(v) => typed_max!(v, I32),
1377 DTypeArray::I64(v) => typed_max!(v, I64),
1378 DTypeArray::F32(v) => typed_max!(v, F32),
1379 DTypeArray::F64(v) => typed_max!(v, F64),
1380 DTypeArray::Bool(v) => typed_max!(v, Bool),
1381 DTypeArray::Char(v) => typed_max!(v, Char),
1382 DTypeArray::Str(v) => typed_max!(v, Str),
1383 }
1384 }
1385}
1386
1387impl Vector for Series {
1388 type Scalar = Scalar;
1389
1390 fn add_vec(&self, rhs: &Self) -> Self {
1406 assert_eq!(self.dtype, rhs.dtype, "DTypes are not same (add_vec)");
1407 dtype_match!(
1408 N;
1409 self.dtype,
1410 self.to_vec(),
1411 |x| add_vec(x, rhs.to_vec());
1412 Vec
1413 )
1414 }
1415
1416 fn sub_vec(&self, rhs: &Self) -> Self {
1432 assert_eq!(self.dtype, rhs.dtype, "DTypes are not same (add_vec)");
1433 dtype_match!(
1434 N;
1435 self.dtype,
1436 self.to_vec(),
1437 |x| sub_vec(x, rhs.to_vec());
1438 Vec
1439 )
1440 }
1441
1442 fn mul_scalar(&self, rhs: Self::Scalar) -> Self {
1458 assert_eq!(self.dtype, rhs.dtype, "DTypes are not same (mul_scalar)");
1459
1460 dtype_match!(
1461 N;
1462 self.dtype,
1463 self.to_vec(),
1464 |x| mul_scalar(x, rhs.unwrap());
1465 Vec
1466 )
1467 }
1468}
1469
1470impl_typed_scalar!(usize, USIZE);
1471impl_typed_scalar!(u8, U8);
1472impl_typed_scalar!(u16, U16);
1473impl_typed_scalar!(u32, U32);
1474impl_typed_scalar!(u64, U64);
1475impl_typed_scalar!(isize, ISIZE);
1476impl_typed_scalar!(i8, I8);
1477impl_typed_scalar!(i16, I16);
1478impl_typed_scalar!(i32, I32);
1479impl_typed_scalar!(i64, I64);
1480impl_typed_scalar!(f32, F32);
1481impl_typed_scalar!(f64, F64);
1482impl_typed_scalar!(bool, Bool);
1483impl_typed_scalar!(char, Char);
1484impl_typed_scalar!(String, Str);
1485
1486impl_typed_vector!(usize, USIZE);
1487impl_typed_vector!(u8, U8);
1488impl_typed_vector!(u16, U16);
1489impl_typed_vector!(u32, U32);
1490impl_typed_vector!(u64, U64);
1491impl_typed_vector!(isize, ISIZE);
1492impl_typed_vector!(i8, I8);
1493impl_typed_vector!(i16, I16);
1494impl_typed_vector!(i32, I32);
1495impl_typed_vector!(i64, I64);
1496impl_typed_vector!(f32, F32);
1497impl_typed_vector!(f64, F64);
1498impl_typed_vector!(bool, Bool);
1499impl_typed_vector!(char, Char);
1500impl_typed_vector!(String, Str);
1501
1502impl fmt::Display for Scalar {
1503 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1504 let st = format!("{}, dtype:{}", self.clone().to_string(), self.dtype);
1505 write!(f, "{}", st)
1506 }
1507}
1508
1509impl DataFrame {
1585 pub fn new(v: Vec<Series>) -> Self {
1587 let ics = (0usize..v.len()).map(|x| x.to_string()).collect();
1588
1589 Self { data: v, ics }
1590 }
1591
1592 pub fn header(&self) -> &Vec<String> {
1593 &self.ics
1594 }
1595
1596 pub fn header_mut(&mut self) -> &mut Vec<String> {
1597 &mut self.ics
1598 }
1599
1600 pub fn set_header(&mut self, new_header: Vec<&str>) {
1602 assert_eq!(self.ics.len(), new_header.len(), "Improper Header length!");
1603 self.ics = new_header.into_iter().map(|x| x.to_string()).collect();
1604 }
1605
1606 pub fn push(&mut self, name: &str, series: Series) {
1608 if !self.ics.is_empty() {
1609 assert_eq!(
1610 self.ics.iter().find(|x| x.as_str() == name),
1611 None,
1612 "Repetitive index!"
1613 );
1614 }
1615 self.ics.push(name.to_string());
1616 self.data.push(series);
1617 }
1618
1619 pub fn row(&self, i: usize) -> DataFrame {
1621 let mut df = DataFrame::new(vec![]);
1622 for (j, series) in self.data.iter().enumerate() {
1623 let s = series.at(i);
1624 let new_series = s.to_series();
1625 df.push(&self.ics[j], new_series);
1626 }
1627 df
1628 }
1629
    /// Renders the DataFrame as a text table and returns it as a `String`.
    ///
    /// The first line holds the column labels; every following line starts
    /// with a `r[i]` row label. When the longest column has more than 100
    /// elements, only the first five and last five rows are emitted, separated
    /// by a line of `...` cells. Columns shorter than the longest one get empty
    /// cells for the missing rows. The output has no trailing newline.
    ///
    /// NOTE(review): `tab` and `set_space!` are defined outside this section.
    /// Presumably `tab(s, w)` lays `s` out in a cell of width `w`, the
    /// two-argument `set_space!(elem, space)` widens `space` to fit `elem`, and
    /// the one-argument `set_space!(elem)` yields the cell text — confirm.
    pub fn spread(&self) -> String {
        // `r` is the length of the longest column, i.e. the number of rows shown.
        let r: usize = self
            .data
            .iter()
            .fold(0, |max_len, column| max(max_len, column.len()));
        let h = self.header();

        let mut result = String::new();

        // Abbreviated layout for long frames.
        if r > 100 {
            // Width of the row-label column: grows with the digit count of `r`.
            let lc1 = ((r as f64).log10() as usize) + 5;
            result.push_str(&tab("", lc1));

            // Per-column cell widths, filled while emitting the header line.
            let mut space_vec: Vec<usize> = vec![];
            for i in 0..self.data.len() {
                let v = &self[i];
                let mut space = 0usize;
                // Only elements that can be displayed are measured: the first
                // (up to) five ...
                for j in 0..v.len().min(5) {
                    let elem = v.at(j);
                    set_space!(elem, space);
                }
                // ... and the last five, but only when the column reaches into
                // the displayed tail rows `r - 5..r`.
                if v.len() >= r - 5 {
                    for j in v.len() - 5..v.len() {
                        let elem = v.at(j);
                        set_space!(elem, space);
                    }
                }
                // At least 5 wide, and always wider than the label itself.
                space = max(space + 1, 5);
                let k = &h[i];
                if k.len() >= space {
                    space = k.len() + 1;
                }
                result.push_str(&tab(k, space));
                space_vec.push(space);
            }
            result.push('\n');

            // Leading five rows.
            for i in 0..5 {
                result.push_str(&tab(&format!("r[{}]", i), lc1));
                for j in 0..self.data.len() {
                    let v = &self[j];
                    let space = space_vec[j];
                    if i < v.len() {
                        let elem = v.at(i);
                        let st = set_space!(elem);
                        result.push_str(&tab(&st, space));
                    } else {
                        result.push_str(&tab("", space));
                    }
                }
                result.push('\n');
            }
            // Separator line marking the omitted rows.
            result.push_str(&tab("...", lc1));
            for &space in space_vec.iter() {
                result.push_str(&tab("...", space));
            }
            result.push('\n');
            // Trailing five rows.
            for i in r - 5..r {
                result.push_str(&tab(&format!("r[{}]", i), lc1));
                for j in 0..self.data.len() {
                    let v = &self[j];
                    let space = space_vec[j];
                    if i < v.len() {
                        let elem = v.at(i);
                        let st = set_space!(elem);
                        result.push_str(&tab(&st, space));
                    } else {
                        result.push_str(&tab("", space));
                    }
                }
                // Skip the newline after the final row.
                if i == r - 1 {
                    break;
                }
                result.push('\n');
            }
            return result;
        }

        // Full layout: every row is printed and every element is measured.
        result.push_str(&tab("", 5));
        let mut space_vec: Vec<usize> = vec![];

        for i in 0..self.data.len() {
            let v = &self[i];
            let mut space = 0usize;
            for j in 0..v.len() {
                let elem = v.at(j);
                set_space!(elem, space)
            }
            // Same width rule as in the abbreviated layout.
            space = max(space + 1, 5);
            let k = &h[i];
            if k.len() >= space {
                space = k.len() + 1;
            }
            result.push_str(&tab(k, space));
            space_vec.push(space);
        }
        result.push('\n');

        for i in 0..r {
            result.push_str(&tab(&format!("r[{}]", i), 5));
            for j in 0..self.data.len() {
                let v = &self[j];
                let space = space_vec[j];
                if i < v.len() {
                    let elem = v.at(i);
                    let st = set_space!(elem);
                    result.push_str(&tab(&st, space));
                } else {
                    result.push_str(&tab("", space));
                }
            }
            // Skip the newline after the final row.
            if i == (r - 1) {
                break;
            }
            result.push('\n');
        }
        result
    }
1748
1749 pub fn as_types(&mut self, dtypes: Vec<DType>) {
1772 assert_eq!(
1773 self.data.len(),
1774 dtypes.len(),
1775 "Length of dtypes are not compatible with DataFrame"
1776 );
1777 for (i, dtype) in dtypes.into_iter().enumerate() {
1778 self[i].as_type(dtype);
1779 }
1780 }
1781
1782 pub fn drop(&mut self, col_header: &str) {
1806 match self.ics.iter().position(|h| h == col_header) {
1807 Some(index) => {
1808 self.data.remove(index);
1809 self.ics.remove(index);
1810 }
1811 None => panic!("Can't drop header '{}'", col_header),
1812 }
1813 }
1814
1815 pub fn filter_by<F>(&self, column: &str, predicate: F) -> anyhow::Result<DataFrame>
1817 where
1818 F: Fn(Scalar) -> bool,
1819 {
1820 let series = match self.ics.iter().position(|x| x.as_str() == column) {
1821 Some(i) => &self.data[i],
1822 None => anyhow::bail!("Column '{}' not found in DataFrame", column),
1823 };
1824
1825 let mut indices = Vec::new();
1826 for i in 0..series.len() {
1827 let value = series.at(i);
1828 if predicate(value) {
1829 indices.push(i);
1830 }
1831 }
1832
1833 let mut new_df = DataFrame::new(vec![]);
1834 for (col_idx, col_series) in self.data.iter().enumerate() {
1835 let filtered_series = col_series.select_indices(&indices);
1836 new_df.push(&self.ics[col_idx], filtered_series);
1837 }
1838
1839 Ok(new_df)
1840 }
1841
1842 pub fn mask(&self, mask: &Series) -> anyhow::Result<DataFrame> {
1844 if mask.len() != self.data[0].len() {
1845 anyhow::bail!(
1846 "Mask length ({}) does not match DataFrame row count ({})",
1847 mask.len(),
1848 self.data[0].len()
1849 );
1850 }
1851
1852 if mask.dtype != DType::Bool {
1853 anyhow::bail!("Mask Series must be of type Bool, but got {}", mask.dtype);
1854 }
1855
1856 let bool_mask: &[bool] = mask.as_slice();
1857 let ics: Vec<usize> = bool_mask
1858 .iter()
1859 .enumerate()
1860 .filter_map(|(i, &b)| if b { Some(i) } else { None })
1861 .collect();
1862
1863 Ok(self.select_rows(&ics))
1864 }
1865
1866 pub fn select_rows(&self, indices: &[usize]) -> DataFrame {
1868 let mut new_df = DataFrame::new(vec![]);
1869 for (col_idx, col_series) in self.data.iter().enumerate() {
1870 let filtered_series = col_series.select_indices(indices);
1871 new_df.push(&self.ics[col_idx], filtered_series);
1872 }
1873 new_df
1874 }
1875
1876 pub fn nrow(&self) -> usize {
1882 self.data.iter().fold(0, |acc, s| max(acc, s.len()))
1883 }
1884
    /// Returns the number of columns.
    pub fn ncol(&self) -> usize {
        self.data.len()
    }
1889
1890 pub fn shape(&self) -> (usize, usize) {
1892 (self.nrow(), self.ncol())
1893 }
1894
1895 pub fn dtypes(&self) -> Vec<DType> {
1897 self.data.iter().map(|s| s.dtype).collect()
1898 }
1899
1900 pub fn is_empty(&self) -> bool {
1902 self.data.is_empty() || self.nrow() == 0
1903 }
1904
1905 pub fn contains(&self, col_header: &str) -> bool {
1907 self.ics.iter().any(|x| x.as_str() == col_header)
1908 }
1909
1910 pub fn head(&self, n: usize) -> DataFrame {
1916 let nrow = self.nrow();
1917 let end = n.min(nrow);
1918 let indices: Vec<usize> = (0..end).collect();
1919 self.select_rows(&indices)
1920 }
1921
1922 pub fn tail(&self, n: usize) -> DataFrame {
1924 let nrow = self.nrow();
1925 let start = nrow.saturating_sub(n);
1926 let indices: Vec<usize> = (start..nrow).collect();
1927 self.select_rows(&indices)
1928 }
1929
1930 pub fn slice(&self, offset: usize, length: usize) -> DataFrame {
1932 let nrow = self.nrow();
1933 let end = (offset + length).min(nrow);
1934 let indices: Vec<usize> = (offset..end).collect();
1935 self.select_rows(&indices)
1936 }
1937
1938 pub fn select(&self, columns: &[&str]) -> DataFrame {
1946 let mut new_df = DataFrame::new(vec![]);
1947 for &col in columns {
1948 let i = self
1949 .ics
1950 .iter()
1951 .position(|x| x.as_str() == col)
1952 .unwrap_or_else(|| panic!("Column '{}' not found in DataFrame", col));
1953 new_df.push(col, self.data[i].clone());
1954 }
1955 new_df
1956 }
1957
1958 pub fn rename(&mut self, old: &str, new: &str) {
1962 let i = self
1963 .ics
1964 .iter()
1965 .position(|x| x.as_str() == old)
1966 .unwrap_or_else(|| panic!("Column '{}' not found in DataFrame", old));
1967 self.ics[i] = new.to_string();
1968 }
1969
1970 pub fn column_names(&self) -> Vec<&str> {
1972 self.ics.iter().map(|s| s.as_str()).collect()
1973 }
1974
1975 pub fn select_dtypes(&self, dtypes: &[DType]) -> DataFrame {
1977 let mut new_df = DataFrame::new(vec![]);
1978 for (i, series) in self.data.iter().enumerate() {
1979 if dtypes.contains(&series.dtype) {
1980 new_df.push(&self.ics[i], series.clone());
1981 }
1982 }
1983 new_df
1984 }
1985
1986 pub fn describe(&self) -> DataFrame {
1995 use crate::statistics::stat::Statistics;
1996
1997 let stat_labels = vec!["count", "mean", "sd", "min", "max"];
1998 let mut result = DataFrame::new(vec![]);
1999 result.push("stat", Series::new(stat_labels.iter().map(|s| s.to_string()).collect::<Vec<String>>()));
2000
2001 for (i, series) in self.data.iter().enumerate() {
2002 if let Ok(v) = series.to_f64_vec() {
2003 if v.is_empty() {
2004 continue;
2005 }
2006 let count = v.len() as f64;
2007 let mean = v.mean();
2008 let sd = if v.len() > 1 { v.sd() } else { 0.0 };
2009 let min_val = v.iter().cloned().fold(f64::INFINITY, f64::min);
2010 let max_val = v.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
2011 result.push(
2012 &self.ics[i],
2013 Series::new(vec![count, mean, sd, min_val, max_val]),
2014 );
2015 }
2016 }
2017
2018 result
2019 }
2020
2021 pub fn sum(&self) -> DataFrame {
2023 let mut result = DataFrame::new(vec![]);
2024 for (i, series) in self.data.iter().enumerate() {
2025 if let Ok(v) = series.to_f64_vec() {
2026 let s: f64 = v.iter().sum();
2027 result.push(&self.ics[i], Series::new(vec![s]));
2028 }
2029 }
2030 result
2031 }
2032
2033 pub fn mean(&self) -> DataFrame {
2035 use crate::statistics::stat::Statistics;
2036
2037 let mut result = DataFrame::new(vec![]);
2038 for (i, series) in self.data.iter().enumerate() {
2039 if let Ok(v) = series.to_f64_vec() {
2040 if v.is_empty() {
2041 continue;
2042 }
2043 let m = v.mean();
2044 result.push(&self.ics[i], Series::new(vec![m]));
2045 }
2046 }
2047 result
2048 }
2049}
2050
2051impl Index<&str> for DataFrame {
2052 type Output = Series;
2053
2054 fn index(&self, index: &str) -> &Self::Output {
2055 let i = self.ics.iter().position(|x| x.as_str() == index).unwrap();
2056 &self.data[i]
2057 }
2058}
2059
2060impl IndexMut<&str> for DataFrame {
2061 fn index_mut(&mut self, index: &str) -> &mut Self::Output {
2062 let i = self.ics.iter().position(|x| x.as_str() == index).unwrap();
2063 &mut self.data[i]
2064 }
2065}
2066
/// Immutable column access by position: `df[0]`.
impl Index<usize> for DataFrame {
    type Output = Series;

    /// Returns the column at position `index`; panics if `index` is out of
    /// bounds (plain `Vec` indexing).
    fn index(&self, index: usize) -> &Self::Output {
        &self.data[index]
    }
}
2074
/// Mutable column access by position: `df[0]`.
impl IndexMut<usize> for DataFrame {
    /// Returns the column at position `index` mutably; panics if `index` is
    /// out of bounds (plain `Vec` indexing).
    fn index_mut(&mut self, index: usize) -> &mut Self::Output {
        &mut self.data[index]
    }
}
2080
2081impl fmt::Display for DataFrame {
2082 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
2083 write!(f, "{}", self.spread())
2084 }
2085}
2086
/// CSV input/output, available when the `csv` feature is enabled.
#[cfg(feature = "csv")]
pub trait WithCSV: Sized {
    /// Writes `self` to the CSV file at `file_path`.
    fn write_csv(&self, file_path: &str) -> Result<(), Box<dyn Error>>;
    /// Reads a value from the CSV file at `file_path`, using `delimiter` as
    /// the field separator.
    fn read_csv(file_path: &str, delimiter: char) -> Result<Self, Box<dyn Error>>;
}
2097
2098#[cfg(feature = "csv")]
2099impl WithCSV for DataFrame {
2100 fn write_csv(&self, file_path: &str) -> Result<(), Box<dyn Error>> {
2102 let mut wtr = WriterBuilder::new().from_path(file_path)?;
2103 let r: usize = self
2104 .data
2105 .iter()
2106 .fold(0, |max_len, column| max(max_len, column.len()));
2107 let c: usize = self.data.len();
2108 wtr.write_record(self.header().clone())?;
2109
2110 for i in 0..r {
2111 let mut record: Vec<String> = vec!["".to_string(); c];
2112 for (j, v) in self.data.iter().enumerate() {
2113 if i < v.len() {
2114 record[j] = v.at(i).to_string();
2115 }
2116 }
2117 wtr.write_record(record)?;
2118 }
2119 wtr.flush()?;
2120 Ok(())
2121 }
2122
2123 fn read_csv(file_path: &str, delimiter: char) -> Result<Self, Box<dyn Error>> {
2125 let mut rdr = ReaderBuilder::new()
2126 .has_headers(true)
2127 .delimiter(delimiter as u8)
2128 .from_path(file_path)?;
2129
2130 let headers_vec = rdr.headers()?;
2131 let headers = headers_vec.iter().map(|x| x).collect::<Vec<&str>>();
2132 let mut result = DataFrame::new(vec![]);
2133 for h in headers.iter() {
2134 result.push(*h, Series::new(Vec::<String>::new()));
2135 }
2136
2137 for rec in rdr.deserialize() {
2138 let record: HashMap<String, String> = rec?;
2139 for head in record.keys() {
2140 let value = &record[head];
2141 if value.len() > 0 {
2142 result[head.as_str()].push(value.to_string());
2143 }
2144 }
2145 }
2146
2147 Ok(result)
2148 }
2149}
2150
/// NetCDF input/output, available when the `nc` feature is enabled.
#[cfg(feature = "nc")]
pub trait WithNetCDF: Sized {
    /// Writes `self` to the NetCDF file at `file_path`.
    fn write_nc(&self, file_path: &str) -> Result<(), Box<dyn Error>>;
    /// Reads a value from the NetCDF file at `file_path`.
    fn read_nc(file_path: &str) -> Result<Self, Box<dyn Error>>;
    /// Reads only the variables named in `header` from the NetCDF file at
    /// `file_path`.
    fn read_nc_by_header(file_path: &str, header: Vec<&str>) -> Result<Self, Box<dyn Error>>;
}
2158
2159#[cfg(feature = "nc")]
2160impl WithNetCDF for DataFrame {
    /// Writes every column to a new NetCDF file at `file_path`.
    ///
    /// Each column becomes one variable named after its label, with its own
    /// dimension named `"{i}th col"` whose size is the column length.
    ///
    /// # Errors
    ///
    /// Propagates any error from creating the file, adding dimensions or
    /// variables, or writing values.
    fn write_nc(&self, file_path: &str) -> Result<(), Box<dyn Error>> {
        let mut f = netcdf::create(file_path)?;

        for (i, h) in self.header().iter().enumerate() {
            // One dimension per column, so columns may differ in length.
            let dim_name = format!("{}th col", i);
            let v = &self[h.as_str()];
            let dim = v.len();
            f.add_dimension(&dim_name, dim)?;
            match v.dtype {
                // Dtypes reported as numeric are written with the variable
                // type given by `dtype_to_vtype`.
                // NOTE(review): `is_numeric`, `dtype_to_vtype`, `dtype_match!`
                // and `nc_put_value` are defined elsewhere — confirm which
                // dtypes take this arm.
                dtype if dtype.is_numeric() => {
                    let vtype = dtype_to_vtype(dtype);
                    let var = &mut f.add_variable_with_type(
                        h,
                        &[&dim_name],
                        &VariableType::Basic(vtype),
                    )?;
                    dtype_match!(N; dtype, v.to_vec(), |v| nc_put_value(var, v); Vec)?;
                }
                // Strings are written one element at a time.
                Str => {
                    let var = &mut f.add_string_variable(h, &[&dim_name])?;
                    let v_s: &[String] = v.as_slice();
                    for (i, s) in v_s.iter().enumerate() {
                        var.put_string(s, Some(&[i]))?;
                    }
                }
                // The remaining dtypes are converted with `to_type` before
                // writing: usize -> u64, isize -> i64, bool -> u8, char -> u8.
                USIZE => {
                    let v = v.to_type(U64);
                    let var = &mut f.add_variable::<u64>(h, &[&dim_name])?;
                    let v_slice: &[u64] = v.as_slice();
                    var.put_values(v_slice, None, None)?;
                }
                ISIZE => {
                    let v = v.to_type(I64);
                    let var = &mut f.add_variable::<i64>(h, &[&dim_name])?;
                    let v_slice: &[i64] = v.as_slice();
                    var.put_values(v_slice, None, None)?;
                }
                Bool => {
                    let v = v.to_type(U8);
                    let var = &mut f.add_variable::<u8>(h, &[&dim_name])?;
                    let v_slice: &[u8] = v.as_slice();
                    var.put_values(v_slice, None, None)?;
                }
                Char => {
                    let v = v.to_type(U8);
                    let var = &mut f.add_variable::<u8>(h, &[&dim_name])?;
                    let v_slice: &[u8] = v.as_slice();
                    var.put_values(v_slice, None, None)?;
                }
                // Treated as impossible by the original author: every dtype is
                // expected to be handled by one of the arms above.
                _ => unreachable!(),
            }
        }

        Ok(())
    }
2217
2218 fn read_nc(file_path: &str) -> Result<Self, Box<dyn Error>> {
2220 let f = netcdf::open(file_path)?;
2221 let mut df = DataFrame::new(vec![]);
2222 for v in f.variables() {
2223 let h = v.name();
2224 if v.vartype().is_string() {
2225 let mut data: Vec<String> = vec![Default::default(); v.len()];
2226 for i in 0..v.len() {
2227 data[i] = v.string_value(Some(&[i]))?;
2228 }
2229 df.push(&h, Series::new(data));
2230 } else {
2231 let dtype = vtype_to_dtype(v.vartype().as_basic().unwrap());
2232 let series = dtype_match!(N; dtype, vec![], |vec| nc_read_value(&v, vec); Vec)?;
2233 df.push(&h, series);
2234 }
2235 }
2236 Ok(df)
2237 }
2238
2239 fn read_nc_by_header(file_path: &str, header: Vec<&str>) -> Result<Self, Box<dyn Error>> {
2265 let f = netcdf::open(file_path)?;
2266 let mut df = DataFrame::new(vec![]);
2267 for h in header {
2268 let v = match f.variable(h) {
2269 Some(val) => val,
2270 None => panic!("There are no corresponding values"),
2271 };
2272 if v.vartype().is_string() {
2273 let mut data: Vec<String> = vec![Default::default(); v.len()];
2274 for i in 0..v.len() {
2275 data[i] = v.string_value(Some(&[i]))?;
2276 }
2277 df.push(&h, Series::new(data));
2278 } else {
2279 let dtype = vtype_to_dtype(v.vartype().as_basic().unwrap());
2280 let series = dtype_match!(N; dtype, vec![], |vec| nc_read_value(&v, vec); Vec)?;
2281 df.push(&h, series);
2282 }
2283 }
2284 Ok(df)
2285 }
2286}
2287
/// Parquet input/output, available when the `parquet` feature is enabled.
#[cfg(feature = "parquet")]
pub trait WithParquet {
    /// Writes `self` to the Parquet file at `file_path` using the given
    /// `compression` setting.
    fn write_parquet(
        &self,
        file_path: &str,
        compression: Compression,
    ) -> Result<(), Box<dyn Error>>;
    /// Reads a value from the Parquet file at `file_path`.
    fn read_parquet(file_path: &str) -> Result<Self, Box<dyn Error>>
    where
        Self: Sized;
}
2301
// Converts one Arrow column of a record batch into a `Vec<$rust_type>` and
// stores it in `$hash_map` under key `$h`.
//
// `$arr` is downcast to `$arrow_type` (panicking if the downcast fails) and
// bound to `$concrete_array`, which `$extract_body` uses to produce the values.
// If the key already holds a series (from an earlier batch), the new values
// are appended to a copy of its contents and the entry is replaced.
#[cfg(feature = "parquet")]
macro_rules! process_column {
    ($hash_map:expr, $h:expr, $arr:expr, $arrow_type:ty, $rust_type:ty, |$concrete_array:ident| $extract_body:expr) => {{
        let $concrete_array = $arr.as_any().downcast_ref::<$arrow_type>().unwrap();
        let fresh: Vec<$rust_type> = $extract_body;

        let merged: Vec<$rust_type> = match $hash_map.get_mut($h) {
            Some(previous) => {
                let mut combined: Vec<$rust_type> = previous.to_vec();
                combined.extend(fresh.iter().cloned());
                combined
            }
            None => fresh,
        };
        $hash_map.insert($h.clone(), Series::new(merged));
    }};
}
2333
2334#[cfg(feature = "parquet")]
2335impl WithParquet for DataFrame {
2336 fn write_parquet(
2338 &self,
2339 file_path: &str,
2340 compression: Compression,
2341 ) -> Result<(), Box<dyn Error>> {
2342 let mut schema_vec = vec![];
2343 let mut arr_vec = vec![];
2344
2345 let max_length = self.data.iter().fold(0usize, |acc, x| acc.max(x.len()));
2346
2347 for h in self.header().iter() {
2348 let v = &self[h.as_str()];
2349 let field = Field::new(h.as_str(), dtype_to_arrow(v.dtype), false);
2350
2351 dtype_match_to_arrow!(v.dtype, v.to_vec(), arr_vec; max_length);
2352 schema_vec.push(field);
2353 }
2354
2355 let schema = Arc::new(Schema::new(schema_vec));
2356 let parquet_schema = ArrowSchemaConverter::new()
2357 .convert(&schema)
2358 .map_err(|e| format!("Failed to convert schema: {}", e))?;
2359 let writer_properties = WriterProperties::builder()
2360 .set_compression(compression)
2361 .build();
2362 let props = Arc::new(writer_properties);
2363
2364 let col_writers = get_column_writers(&parquet_schema, &props, &schema)?;
2365 let mut workers: Vec<_> = col_writers
2366 .into_iter()
2367 .map(|mut col_writer| {
2368 let (send, recv) = std::sync::mpsc::channel::<ArrowLeafColumn>();
2369 let handle = std::thread::spawn(move || {
2370 for col in recv {
2371 col_writer.write(&col)?;
2372 }
2373 col_writer.close()
2374 });
2375 (handle, send)
2376 })
2377 .collect();
2378
2379 let root_schema = parquet_schema.root_schema_ptr();
2380 let mut output_file = std::fs::File::create(file_path)?;
2381 let mut writer = SerializedFileWriter::new(&mut output_file, root_schema, props.clone())?;
2382
2383 let mut row_group_writer: SerializedRowGroupWriter<'_, _> = writer.next_row_group()?;
2384
2385 let mut worker_iter = workers.iter_mut();
2386 for (arr, field) in arr_vec.iter().zip(&schema.fields) {
2387 for leaves in compute_leaves(field, &Arc::new(arr))? {
2388 worker_iter.next().unwrap().1.send(leaves)?;
2389 }
2390 }
2391
2392 for (handle, send) in workers {
2393 use parquet::arrow::arrow_writer::ArrowColumnChunk;
2394
2395 drop(send);
2396 let chunk: ArrowColumnChunk = handle.join().unwrap().unwrap();
2397 chunk.append_to_row_group(&mut row_group_writer)?;
2398 }
2399 row_group_writer.close()?;
2400 writer.close()?;
2401
2402 Ok(())
2403 }
2404
    /// Reads the Parquet file at `file_path` into a DataFrame.
    ///
    /// All record batches are read, and the columns of each batch are appended
    /// to the series collected so far under the field name. Column order
    /// follows insertion order into the `IndexMap`.
    ///
    /// # Errors
    ///
    /// Propagates errors from opening the file, building the reader and
    /// reading the batches.
    ///
    /// # Panics
    ///
    /// Panics if a column cannot be downcast to the Arrow array type expected
    /// for its dtype (the `unwrap` inside `process_column!`).
    fn read_parquet(file_path: &str) -> Result<Self, Box<dyn Error>>
    where
        Self: Sized,
    {
        use parquet::arrow::arrow_reader::ParquetRecordBatchReader;

        let mut df = DataFrame::new(vec![]);

        let file = std::fs::File::open(file_path)?;
        // A first builder is created only to obtain the schema fields.
        let builder = ParquetRecordBatchReaderBuilder::try_new(file.try_clone()?)?;
        let schema = builder.schema();
        let fields = schema.fields.clone();
        // Start with the largest possible batch size and divide it by 10 after
        // every failed attempt to build the reader.
        let mut batch_size = usize::MAX;
        let reader: ParquetRecordBatchReader = loop {
            let builder = ParquetRecordBatchReaderBuilder::try_new(file.try_clone()?)?;
            let reader = builder.with_batch_size(batch_size).build();
            match reader {
                Ok(r) => break r,
                Err(e) => {
                    if batch_size > 0 {
                        batch_size /= 10;
                    } else {
                        // NOTE(review): this branch is reached once the batch
                        // size has dropped to 0, although the message says 1.
                        println!(
                            "Failed to read parquet file: {} with eventually batch size 1",
                            e
                        );
                        return Err(Box::new(e));
                    }
                }
            }
        };
        let all_batches: Vec<_> = reader.collect::<Result<Vec<_>, _>>()?;

        // Accumulates one series per field name across all batches.
        let mut hash_map = IndexMap::<String, Series>::new();
        for batch in all_batches {
            let arrs = batch.columns();

            for (field, arr) in fields.iter().zip(arrs) {
                let h = field.name();
                let dt = field.data_type();
                let at = arrow_to_dtype(dt.clone());
                match at {
                    Bool => process_column!(hash_map, h, arr, BooleanArray, bool, |d| d
                        .values()
                        .iter()
                        .collect()),
                    // Keeps the first character of every non-null string;
                    // null and empty entries are dropped by `filter_map`.
                    Char => process_column!(hash_map, h, arr, StringArray, char, |d| d
                        .iter()
                        .filter_map(|opt_s| opt_s.and_then(|s| s.chars().next()))
                        .collect()),
                    // Null entries are dropped by `filter_map`.
                    Str => process_column!(hash_map, h, arr, StringArray, String, |d| d
                        .iter()
                        .filter_map(|opt_s| opt_s.map(String::from))
                        .collect()),
                    // `usize` columns are read from 64-bit unsigned data and cast.
                    USIZE => {
                        process_column!(hash_map, h, arr, PrimitiveArray<UInt64Type>, usize, |d| d
                            .values()
                            .iter()
                            .map(|&x| x as usize)
                            .collect())
                    }
                    U8 => process_column!(hash_map, h, arr, PrimitiveArray<UInt8Type>, u8, |d| d
                        .values()
                        .to_vec()),
                    U16 => {
                        process_column!(hash_map, h, arr, PrimitiveArray<UInt16Type>, u16, |d| d
                            .values()
                            .to_vec())
                    }
                    U32 => {
                        process_column!(hash_map, h, arr, PrimitiveArray<UInt32Type>, u32, |d| d
                            .values()
                            .to_vec())
                    }
                    U64 => {
                        process_column!(hash_map, h, arr, PrimitiveArray<UInt64Type>, u64, |d| d
                            .values()
                            .to_vec())
                    }
                    // `isize` columns are read from 64-bit signed data and cast.
                    ISIZE => {
                        process_column!(hash_map, h, arr, PrimitiveArray<Int64Type>, isize, |d| d
                            .values()
                            .iter()
                            .map(|&x| x as isize)
                            .collect())
                    }
                    I8 => process_column!(hash_map, h, arr, PrimitiveArray<Int8Type>, i8, |d| d
                        .values()
                        .to_vec()),
                    I16 => process_column!(hash_map, h, arr, PrimitiveArray<Int16Type>, i16, |d| d
                        .values()
                        .to_vec()),
                    I32 => process_column!(hash_map, h, arr, PrimitiveArray<Int32Type>, i32, |d| d
                        .values()
                        .to_vec()),
                    I64 => process_column!(hash_map, h, arr, PrimitiveArray<Int64Type>, i64, |d| d
                        .values()
                        .to_vec()),
                    F32 => {
                        process_column!(hash_map, h, arr, PrimitiveArray<Float32Type>, f32, |d| d
                            .values()
                            .to_vec())
                    }
                    F64 => {
                        process_column!(hash_map, h, arr, PrimitiveArray<Float64Type>, f64, |d| d
                            .values()
                            .to_vec())
                    }
                }
            }
        }

        // Move the accumulated series into the DataFrame in insertion order.
        for (h, data) in hash_map {
            df.push(&h, data);
        }

        Ok(df)
    }
2524}