Hi, I am trying to create a model with my csv dataset. The dataset has this CSV format:
filename,width,height,class,xmin,ymin,ymax,xmax
6774t.jpg,6016,4016,microchip,3698,65,395,4028
6774t.jpg,6016,4016,connector,3028,814,1453,3102
6774t.jpg,6016,4016,microchip,3362,1901,2132,3560
I am trying to train the model in a notebook on a paperspace virtual machine. here is the code that I use to import my dataset:
np.set_printoptions(precision=3, suppress=True)
pcb_train = pd.read_csv(
'/notebooks/CSV/Photos/Dataset.csv')
pcb_train.head()
After that I convert the images to numeric arrays:
from PIL import Image
import numpy as np
def load_data(filename): # Assuming desired image size
"""Loads image, resizes, converts to NumPy array, and extracts label."""
image_path = f"/notebooks/CSV/Photos/{filename}" # Assuming image path format
img = Image.open(image_path)
img_array = np.array(img)
# Assuming 'class' is the label column in your DataFrame
label_row = pcb_train[pcb_train['filename'] == filename]
label = label_row['class'].values[0]
return img_array, label
feature_array = []
oldfilename = ''
processed_count = 0
for index, row in pcb_train.iterrows():
filename = row['filename']
try:
# Check if filename is the same as the previous one
if filename == oldfilename:
# If same, reuse the already loaded image array (efficiency)
feature_array.append(img_array)
processed_count += 1
new_progress_message = f"Progress: {processed_count}/{len(pcb_train)}"
oldfilename = filename
else:
# If different, load the image and label using the function
img_array, label = load_data(filename)
feature_array.append(img_array)
processed_count += 1
new_progress_message = f"Progress: {processed_count}/{len(pcb_train)}"
oldfilename = filename
print(f"\r{new_progress_message}", end="")
except (FileNotFoundError, KeyError) as e:
print(f"Error processing file: {filename} - {e}")
# Consider logging the error or taking specific actions
pcb_train['filename'] = feature_array
new_progress_message = "Progress: finished"
print(f"\r{new_progress_message}", end="")
pcb_train.head()
this results in this format:
[[[51, 45, 9], [50, 44, 8], [49, 43, 7], [50, ... 6016 4016 microchip 3698 65 395 4028
now I still need to split the labels from the rest of the data, I did that this way:
pcb_features = pcb_train.copy()
pcb_labels = pcb_features.pop('class')
print(pcb_labels.head())
print(pcb_features.head())
which results in this:
0 microchip
1 connector
2 microchip
3 inductor
4 inductor
Name: class, dtype: object
filename width height xmin \
0 [[[51, 45, 9], [50, 44, 8], [49, 43, 7], [50, ... 6016 4016 3698
1 [[[51, 45, 9], [50, 44, 8], [49, 43, 7], [50, ... 6016 4016 3028
2 [[[51, 45, 9], [50, 44, 8], [49, 43, 7], [50, ... 6016 4016 3362
3 [[[51, 45, 9], [50, 44, 8], [49, 43, 7], [50, ... 6016 4016 5076
4 [[[51, 45, 9], [50, 44, 8], [49, 43, 7], [50, ... 6016 4016 5076
ymin ymax xmax
0 65 395 4028
1 814 1453 3102
2 1901 2132 3560
3 1420 1589 5248
4 869 1041 5248
but now when I try to run train the model with this code:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(pcb_features, pcb_labels, test_size=0.2, random_state=42)
# print(X_train.head)
# print(y_train.head)
# Define a Logistic Regression model
model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train)
I get this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
TypeError: only size-1 arrays can be converted to Python scalars
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
Cell In [81], line 16
10 # print(X_train.head)
11 # print(y_train.head)
12
13 # Define a Logistic Regression model
14 model = LogisticRegression(solver='lbfgs')
---> 16 model.fit(X_train, y_train)
File /usr/local/lib/python3.9/dist-packages/sklearn/linear_model/_logistic.py:1138, in LogisticRegression.fit(self, X, y, sample_weight)
1135 else:
1136 _dtype = [np.float64, np.float32]
-> 1138 X, y = self._validate_data(
1139 X,
1140 y,
1141 accept_sparse="csr",
1142 dtype=_dtype,
1143 order="C",
1144 accept_large_sparse=solver not in ["liblinear", "sag", "saga"],
1145 )
1146 check_classification_targets(y)
1147 self.classes_ = np.unique(y)
File /usr/local/lib/python3.9/dist-packages/sklearn/base.py:596, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
594 y = check_array(y, input_name="y", **check_y_params)
595 else:
--> 596 X, y = check_X_y(X, y, **check_params)
597 out = X, y
599 if not no_val_X and check_params.get("ensure_2d", True):
File /usr/local/lib/python3.9/dist-packages/sklearn/utils/validation.py:1074, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
1069 estimator_name = _check_estimator_name(estimator)
1070 raise ValueError(
1071 f"{estimator_name} requires y to be passed, but the target y is None"
1072 )
-> 1074 X = check_array(
1075 X,
1076 accept_sparse=accept_sparse,
1077 accept_large_sparse=accept_large_sparse,
1078 dtype=dtype,
1079 order=order,
1080 copy=copy,
1081 force_all_finite=force_all_finite,
1082 ensure_2d=ensure_2d,
1083 allow_nd=allow_nd,
1084 ensure_min_samples=ensure_min_samples,
1085 ensure_min_features=ensure_min_features,
1086 estimator=estimator,
1087 input_name="X",
1088 )
1090 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
1092 check_consistent_length(X, y)
File /usr/local/lib/python3.9/dist-packages/sklearn/utils/validation.py:856, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
854 array = array.astype(dtype, casting="unsafe", copy=False)
855 else:
--> 856 array = np.asarray(array, order=order, dtype=dtype)
857 except ComplexWarning as complex_warning:
858 raise ValueError(
859 "Complex data not supported\n{}\n".format(array)
860 ) from complex_warning
File /usr/local/lib/python3.9/dist-packages/pandas/core/generic.py:2069, in NDFrame.__array__(self, dtype)
2068 def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
-> 2069 return np.asarray(self._values, dtype=dtype)
ValueError: setting an array element with a sequence.
which to my understanding is telling me that I need a single sized array for y_train, but y_train is already single sized? I am just lost and don’t know what I am doing wrong, I would realy love some help with this <3