Can't get tf2 training to work with CSV dataset

Hi, I am trying to create a model with my csv dataset. The dataset has this CSV format:

filename,width,height,class,xmin,ymin,ymax,xmax
6774t.jpg,6016,4016,microchip,3698,65,395,4028
6774t.jpg,6016,4016,connector,3028,814,1453,3102
6774t.jpg,6016,4016,microchip,3362,1901,2132,3560

I am trying to train the model in a notebook on a Paperspace virtual machine. Here is the code that I use to import my dataset:

# Print floats with three decimals and in fixed-point (non-scientific) notation.
np.set_printoptions(suppress=True, precision=3)

# Load the annotation table; each CSV row describes one labelled box.
pcb_train = pd.read_csv('/notebooks/CSV/Photos/Dataset.csv')

# Preview the first rows (shown as the notebook cell's output).
pcb_train.head()

After that I convert the images to numeric arrays:

from PIL import Image
import numpy as np

def load_data(filename):
    """Load one image as a NumPy array together with its class label.

    Args:
        filename: Image file name as it appears in the ``filename`` column
            of ``pcb_train``; the file is read from ``/notebooks/CSV/Photos/``.

    Returns:
        A tuple ``(img_array, label)``. ``img_array`` is the image converted
        with ``np.array`` (no resizing is performed). ``label`` is the
        ``class`` value of the first row in ``pcb_train`` whose ``filename``
        matches.

    Raises:
        FileNotFoundError: If the image file does not exist.
        IndexError: If no row in ``pcb_train`` has this filename.
    """
    image_path = f"/notebooks/CSV/Photos/{filename}"

    # Open the image in a context manager so the file handle is released as
    # soon as the pixels have been copied; np.array() forces the pixel data
    # to be read while the file is still open.
    with Image.open(image_path) as img:
        img_array = np.array(img)

    # A file can have several annotated rows; only the first row's class is
    # returned here.
    label_row = pcb_train[pcb_train['filename'] == filename]
    label = label_row['class'].values[0]

    return img_array, label


# Replace the 'filename' column of pcb_train with the decoded image arrays.
feature_array = []
failed_rows = []  # index labels of rows whose image could not be loaded
oldfilename = ''
processed_count = 0
total_rows = len(pcb_train)

for index, row in pcb_train.iterrows():
    filename = row['filename']

    try:
        # Only hit the disk when the filename differs from the previous row;
        # consecutive rows for the same image reuse the array already loaded.
        if filename != oldfilename:
            img_array, _ = load_data(filename)
            oldfilename = filename
    except (FileNotFoundError, KeyError) as e:
        print(f"Error processing file: {filename} - {e}")
        # Remember the row so it can be removed below; silently skipping it
        # would leave feature_array shorter than pcb_train and make the
        # column assignment fail with a length mismatch.
        failed_rows.append(index)
        continue

    feature_array.append(img_array)
    processed_count += 1
    print(f"\rProgress: {processed_count}/{total_rows}", end="")

# Drop the rows without an image so feature_array lines up with what is left.
if failed_rows:
    pcb_train = pcb_train.drop(index=failed_rows)
pcb_train['filename'] = feature_array
new_progress_message = "Progress: finished"
print(f"\r{new_progress_message}", end="")
pcb_train.head()

This results in rows of this format:

[[[51, 45, 9], [50, 44, 8], [49, 43, 7], [50, ...	6016	4016	microchip	3698	65	395	4028

Now I still need to split the labels from the rest of the data. I did that this way:

# Work on a copy so that pcb_train itself keeps its 'class' column.
pcb_features = pcb_train.copy()
# pop() removes the column from pcb_features and hands it back as a Series.
pcb_labels = pcb_features.pop('class')

# Show the first rows of the labels, then of the remaining feature columns.
for preview in (pcb_labels, pcb_features):
    print(preview.head())

which results in this:

0    microchip
1    connector
2    microchip
3     inductor
4     inductor
Name: class, dtype: object
                                            filename  width  height  xmin  \
0  [[[51, 45, 9], [50, 44, 8], [49, 43, 7], [50, ...   6016    4016  3698   
1  [[[51, 45, 9], [50, 44, 8], [49, 43, 7], [50, ...   6016    4016  3028   
2  [[[51, 45, 9], [50, 44, 8], [49, 43, 7], [50, ...   6016    4016  3362   
3  [[[51, 45, 9], [50, 44, 8], [49, 43, 7], [50, ...   6016    4016  5076   
4  [[[51, 45, 9], [50, 44, 8], [49, 43, 7], [50, ...   6016    4016  5076   

   ymin  ymax  xmax  
0    65   395  4028  
1   814  1453  3102  
2  1901  2132  3560  
3  1420  1589  5248  
4   869  1041  5248  

But now, when I try to train the model with this code:


# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(pcb_features, pcb_labels, test_size=0.2, random_state=42)


# print(X_train.head)
# print(y_train.head)

# Define a Logistic Regression model
model = LogisticRegression(solver='lbfgs')

# NOTE(review): pcb_features still includes the 'filename' column, which now
# holds one image array per row. fit() converts X to a float64/float32 array,
# and that object column cannot be converted (see the traceback that follows).
model.fit(X_train, y_train) 

I get this error:


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
TypeError: only size-1 arrays can be converted to Python scalars

The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
Cell In [81], line 16
     10 # print(X_train.head)
     11 # print(y_train.head)
     12 
     13 # Define a Logistic Regression model
     14 model = LogisticRegression(solver='lbfgs')
---> 16 model.fit(X_train, y_train) 

File /usr/local/lib/python3.9/dist-packages/sklearn/linear_model/_logistic.py:1138, in LogisticRegression.fit(self, X, y, sample_weight)
   1135 else:
   1136     _dtype = [np.float64, np.float32]
-> 1138 X, y = self._validate_data(
   1139     X,
   1140     y,
   1141     accept_sparse="csr",
   1142     dtype=_dtype,
   1143     order="C",
   1144     accept_large_sparse=solver not in ["liblinear", "sag", "saga"],
   1145 )
   1146 check_classification_targets(y)
   1147 self.classes_ = np.unique(y)

File /usr/local/lib/python3.9/dist-packages/sklearn/base.py:596, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
    594         y = check_array(y, input_name="y", **check_y_params)
    595     else:
--> 596         X, y = check_X_y(X, y, **check_params)
    597     out = X, y
    599 if not no_val_X and check_params.get("ensure_2d", True):

File /usr/local/lib/python3.9/dist-packages/sklearn/utils/validation.py:1074, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
   1069         estimator_name = _check_estimator_name(estimator)
   1070     raise ValueError(
   1071         f"{estimator_name} requires y to be passed, but the target y is None"
   1072     )
-> 1074 X = check_array(
   1075     X,
   1076     accept_sparse=accept_sparse,
   1077     accept_large_sparse=accept_large_sparse,
   1078     dtype=dtype,
   1079     order=order,
   1080     copy=copy,
   1081     force_all_finite=force_all_finite,
   1082     ensure_2d=ensure_2d,
   1083     allow_nd=allow_nd,
   1084     ensure_min_samples=ensure_min_samples,
   1085     ensure_min_features=ensure_min_features,
   1086     estimator=estimator,
   1087     input_name="X",
   1088 )
   1090 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
   1092 check_consistent_length(X, y)

File /usr/local/lib/python3.9/dist-packages/sklearn/utils/validation.py:856, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
    854         array = array.astype(dtype, casting="unsafe", copy=False)
    855     else:
--> 856         array = np.asarray(array, order=order, dtype=dtype)
    857 except ComplexWarning as complex_warning:
    858     raise ValueError(
    859         "Complex data not supported\n{}\n".format(array)
    860     ) from complex_warning

File /usr/local/lib/python3.9/dist-packages/pandas/core/generic.py:2069, in NDFrame.__array__(self, dtype)
   2068 def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
-> 2069     return np.asarray(self._values, dtype=dtype)

ValueError: setting an array element with a sequence.

To my understanding, this is telling me that I need a single-sized array for y_train, but y_train is already single-sized? I am just lost and don’t know what I am doing wrong. I would really love some help with this <3

It looks like your dataset is a mixed bag of lists (lists of lists), strings, and integers. Can you try setting the dtype of your NumPy arrays to object?
Alternatively, can you cast the elements in your filename column to np.array()?

Hi, thanks for your feedback! Right now I am indeed transforming the images with np.array(), but that does not seem to work :frowning: The only other type I have is my class names; the rest are ints.

Any other thoughts? I am also willing to change my dataset format if needed.